Crawl4AI: LLM Extraction with AsyncWebCrawler

2024-11-11

Crawl4AI's AsyncWebCrawler lets you use a language model (LLM) to asynchronously extract structured data or relevant content from web pages. The two examples below show how to use AsyncWebCrawler with LLMExtractionStrategy for different purposes.

Example 1: Extracting Structured Data

In this example, we use LLMExtractionStrategy to extract structured data (model names and their fees) from the OpenAI pricing page, with a pydantic model defining the schema.

```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input tokens for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output tokens for the OpenAI model.")

async def extract_openai_fees():
    url = 'https://openai.com/api/pricing/'

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,  # keep even very short text blocks
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",  # or use Ollama, e.g. provider="ollama/nemotron"
                api_token=os.getenv('OPENAI_API_KEY'),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction=(
                    "From the crawled content, extract all mentioned model names along with "
                    "their fees for input and output tokens. Do not miss anything in the "
                    "entire content. One extracted model JSON object should look like this: "
                    '{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", '
                    '"output_fee": "US$30.00 / 1M tokens"}'
                )
            ),
            bypass_cache=True,  # always fetch a fresh copy of the page
        )

    model_fees = json.loads(result.extracted_content)
    print(f"Number of models extracted: {len(model_fees)}")

    os.makedirs(".data", exist_ok=True)  # ensure the output directory exists
    with open(".data/openai_fees.json", "w", encoding="utf-8") as f:
        json.dump(model_fees, f, indent=2)

asyncio.run(extract_openai_fees())
```
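
Because extraction_type="schema" asks the LLM to return objects shaped like OpenAIModelFee, it can be worth re-validating the parsed JSON through the same pydantic model before using it. A minimal sketch (the validate_fees helper is ours, not part of Crawl4AI):

```python
from pydantic import ValidationError

def validate_fees(raw_fees: list) -> list:
    # Re-parse each extracted dict through the pydantic model so that any
    # malformed LLM output fails loudly instead of propagating downstream.
    validated = []
    for item in raw_fees:
        try:
            validated.append(OpenAIModelFee(**item))
        except ValidationError as exc:
            print(f"Skipping malformed entry {item}: {exc}")
    return validated
```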

Example 2: Extracting Relevant Content

In this example, we instruct the LLM to extract only technology-related content from the NBC News business page.

```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def extract_tech_content():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", 
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                instruction="Extract only content related to technology"
            ),
            bypass_cache=True,
        )

    tech_content = json.loads(result.extracted_content)
    print(f"Number of technology-related items extracted: {len(tech_content)}")

    os.makedirs(".data", exist_ok=True)  # ensure the output directory exists
    with open(".data/tech_content.json", "w", encoding="utf-8") as f:
        json.dump(tech_content, f, indent=2)

asyncio.run(extract_tech_content())
```
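
Without a schema, LLMExtractionStrategy returns block-style output: in the Crawl4AI releases this article targets, each block is a dict with keys such as "index", "tags", and "content", though the exact shape may vary between versions. A small sketch for inspecting the result (the preview_blocks helper is ours):

```python
def preview_blocks(blocks, limit=3):
    # Assumes the block format produced by Crawl4AI's LLM extraction;
    # the "tags"/"content" keys may differ in other versions.
    for block in blocks[:limit]:
        print(f'tags={block.get("tags", [])}')
        print(f'content={str(block.get("content", ""))[:120]}...')
```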

Advanced Usage: Combining JS Execution with LLM Extraction

This example shows how to combine JavaScript execution with LLM extraction to handle dynamic content:

```python
# This example reuses the imports from the earlier ones
# (os, json, asyncio, AsyncWebCrawler, LLMExtractionStrategy).
async def extract_dynamic_content():
    js_code = """
    // Click the "Load More" button, if present, then give the new articles time to load.
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    if (loadMoreButton) {
        loadMoreButton.click();
        await new Promise(resolve => setTimeout(resolve, 2000));
    }
    """

    wait_for = """
    () => {
        // Proceed only once more than 10 article cards are present in the DOM.
        const articles = document.querySelectorAll('article.tease-card');
        return articles.length > 10;
    }
    """

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", 
            js_code=js_code,
            wait_for=wait_for,
            css_selector="article.tease-card",
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                instruction="Summarize each article, focusing on technology-related content"
            ),
            bypass_cache=True,
        )

    summaries = json.loads(result.extracted_content)
    print(f"Number of articles summarized: {len(summaries)}")

    os.makedirs(".data", exist_ok=True)  # ensure the output directory exists
    with open(".data/tech_summaries.json", "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2)

asyncio.run(extract_dynamic_content())
```
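
Two parameters do the heavy lifting here: wait_for postpones extraction until the predicate returns true, so the LLM actually sees the dynamically loaded articles, and css_selector restricts the crawled content to the matched elements, which keeps the prompt smaller and the LLM call cheaper.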

Custom LLM Providers

Under the hood, Crawl4AI uses the litellm library, which lets you plug in any LLM provider you like. Just pass the correct model name and API token:

```python
extraction_strategy=LLMExtractionStrategy(
    provider="your_llm_provider/model_name",
    api_token="your_api_token",
    instruction="your extraction instructions"
)
```

This flexibility lets you integrate with a wide range of LLM providers and tailor the extraction process to your specific needs.
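
For example, here is a sketch pointing the same strategy at a locally served model through Ollama. The model name below is illustrative, and local providers generally ignore the token value:

```python
# Assumes an Ollama server is running locally and the model has been pulled;
# "ollama/llama3" is an illustrative model name, not a requirement.
extraction_strategy = LLMExtractionStrategy(
    provider="ollama/llama3",
    api_token="no-token",  # local providers typically do not check this value
    instruction="Extract the main points from the page"
)
```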

Error Handling and Retries

When working with external LLM APIs, it is important to handle potential errors and implement retry logic. Here is an example of how to do that:

```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from tenacity import retry, stop_after_attempt, wait_exponential

class LLMExtractionError(Exception):
    pass

# reraise=True makes the final LLMExtractionError propagate instead of tenacity.RetryError
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), reraise=True)
async def extract_with_retry(crawler, url, extraction_strategy):
    try:
        result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
        return json.loads(result.extracted_content)
    except Exception as e:
        raise LLMExtractionError(f"Failed to extract content: {e}") from e

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        try:
            content = await extract_with_retry(
                crawler,
                "https://www.example.com", 
                LLMExtractionStrategy(
                    provider="openai/gpt-4o",
                    api_token=os.getenv('OPENAI_API_KEY'),
                    instruction="Extract and summarize the main points"
                )
            )
            print("Extracted content:", content)
        except LLMExtractionError as e:
            print(f"Extraction failed after retries: {e}")

asyncio.run(main())
```

This example uses the tenacity library to implement a retry mechanism with exponential backoff, which helps handle transient failures or rate limits from the LLM API.
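
If you only want to retry on specific failures, tenacity can also filter by exception type. A minimal sketch, assuming you map your provider's rate-limit responses to a custom exception (RateLimitError below is hypothetical):

```python
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

class RateLimitError(Exception):
    """Hypothetical wrapper for a provider's rate-limit (HTTP 429) response."""

@retry(
    retry=retry_if_exception_type(RateLimitError),  # other errors fail immediately
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=2, min=4, max=60),
    reraise=True,
)
async def extract_with_backoff(crawler, url, extraction_strategy):
    # In real code you would catch the provider's error here and raise
    # RateLimitError only for rate-limit responses.
    result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
    return result.extracted_content
```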