Crawl4AI Quick Start Guide

2024-11-11

Getting Started 🛠️

First, let's import the necessary modules and create an AsyncWebCrawler instance. We'll use an async context manager, which handles the crawler's initialization and cleanup for us.

python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Add our crawling code here
        pass

if __name__ == "__main__":
    asyncio.run(main())

Basic Usage

Just provide a URL and let Crawl4AI work its magic!

python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"基本爬取结果:{result.markdown[:500]}")  # 打印前500个字符

asyncio.run(main())

Screenshots 📸

Capture webpage screenshots with ease:

python
async def capture_and_save_screenshot(url: str, output_path: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
            bypass_cache=True
        )

        if result.success and result.screenshot:
            import base64
            screenshot_data = base64.b64decode(result.screenshot)
            with open(output_path, 'wb') as f:
                f.write(screenshot_data)
            print(f"截图保存成功到 {output_path}")
        else:
            print("截图失败")

asyncio.run(capture_and_save_screenshot("https://www.example.com", "screenshot.png"))

Browser Selection 🌐

Crawl4AI supports multiple browser engines. Here are examples of using different browsers:

python
# Using Firefox
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)

# Using WebKit
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)

# Using Chromium (the default)
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)

User Simulation 🎭

Simulate real user behavior to avoid detection:

python
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
    result = await crawler.arun(
        url="YOUR-URL-HERE",
        bypass_cache=True,
        simulate_user=True,  # Triggers random mouse movements and clicks
        override_navigator=True  # Makes the browser look more like a real user
    )

Understanding Parameters 🧠

By default, Crawl4AI caches your crawl results, which means subsequent crawls of the same URL are much faster! Let's see how this works.

python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # First crawl (result is cached)
        result1 = await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"First crawl result: {result1.markdown[:100]}...")

        # Force a fresh crawl
        result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True)
        print(f"Second crawl result: {result2.markdown[:100]}...")

asyncio.run(main())

Adding a Chunking Strategy 🧩

Let's add a chunking strategy: RegexChunking! This strategy splits text according to a given regular expression pattern.

python
from crawl4ai.chunking_strategy import RegexChunking

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",   
            chunking_strategy=RegexChunking(patterns=["\n\n"])
        )
        print(f"RegexChunking 结果:{result.extracted_content[:200]}...")

asyncio.run(main())

Using Different LLM Extraction Strategies 🤖

Crawl4AI supports extraction with multiple LLM providers:

python
import os

from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input tokens for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output tokens for the OpenAI model.")

# OpenAI
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))

# Hugging Face
await extract_structured_data_using_llm(
    "huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", 
    os.getenv("HUGGINGFACE_API_KEY")
)

# Ollama
await extract_structured_data_using_llm("ollama/llama3.2")

# With custom headers
custom_headers = {
    "Authorization": "Bearer your-custom-token",
    "X-Custom-Header": "Some-Value"
}
await extract_structured_data_using_llm(extra_headers=custom_headers)
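
The calls above rely on an extract_structured_data_using_llm helper that isn't shown in the snippet. Below is a minimal sketch of what such a helper might look like, reusing the OpenAIModelFee schema and the LLMExtractionStrategy parameters from the examples in this guide; the pricing URL and the extra_headers pass-through are assumptions for illustration.

python
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: dict = None):
    # Ollama runs locally and needs no API token; skip other providers when the token is missing.
    if api_token is None and not provider.startswith("ollama"):
        print(f"API token is required for {provider}. Skipping.")
        return

    # Pass provider-specific options (such as custom headers) through to the strategy (assumed behavior).
    extra_args = {"extra_headers": extra_headers} if extra_headers else {}

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",  # assumed target page for the fee schema
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider=provider,
                api_token=api_token,
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="Extract all model names along with their fees for input and output tokens.",
                **extra_args
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)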

Knowledge Graph Generation 🕸️

Generate a knowledge graph from web content:

python
import os
from typing import List

from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class Entity(BaseModel):
    name: str
    description: str

class Relationship(BaseModel):
    entity1: Entity
    entity2: Entity
    description: str
    relation_type: str

class KnowledgeGraph(BaseModel):
    entities: List[Entity]
    relationships: List[Relationship]

extraction_strategy = LLMExtractionStrategy(
    provider='openai/gpt-4o-mini',
    api_token=os.getenv('OPENAI_API_KEY'),
    schema=KnowledgeGraph.model_json_schema(),
    extraction_type="schema",
    instruction="从给定文本中提取实体和关系。"
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://paulgraham.com/love.html",   
        bypass_cache=True,
        extraction_strategy=extraction_strategy
    )
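
The strategy returns its output as a JSON string in result.extracted_content. Here is a minimal sketch of validating it back into the KnowledgeGraph model; whether a single object or a list comes back depends on the provider, so the normalization below is an assumption.

python
import json

if result.success and result.extracted_content:
    data = json.loads(result.extracted_content)
    # Some providers return a list of objects rather than a single object (assumption).
    if isinstance(data, list):
        data = data[0]
    graph = KnowledgeGraph.model_validate(data)
    print(f"Extracted {len(graph.entities)} entities and {len(graph.relationships)} relationships")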

Advanced Session-Based Crawling with Dynamic Content 🔄

For modern web applications that load content dynamically, here's how to handle pagination and content updates:

python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def crawl_dynamic_content():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        wait_for = """() => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            if (commits.length === 0) return false;
            const firstCommit = commits[0].textContent.trim();
            return firstCommit !== window.firstCommit;
        }"""

        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
            "fields": [
                {
                    "name": "title",
                    "selector": "h4.markdown-title",
                    "type": "text",
                    "transform": "strip",
                },
            ],
        }
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page if page > 0 else None,
                wait_for=wait_for if page > 0 else None,
                js_only=page > 0,
                bypass_cache=True,
                headless=False,
            )

        await crawler.crawler_strategy.kill_session(session_id)
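
Each call to arun returns that page's commits as a JSON string in result.extracted_content. A minimal sketch of collecting them across pages is shown below; all_commits is a hypothetical accumulator, and the parsing lines belong inside the pagination loop, right after the arun call.

python
import json

all_commits = []  # hypothetical accumulator, created before the pagination loop

# Inside the `for page in range(3)` loop, after each `arun` call:
if result.success and result.extracted_content:
    commits = json.loads(result.extracted_content)
    all_commits.extend(commits)
    print(f"Page {page + 1}: extracted {len(commits)} commits")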

Handling Overlays and Fitting Content 📏

Remove overlay elements and fit the content appropriately:

python
async with AsyncWebCrawler(headless=False) as crawler:
    result = await crawler.arun(
        url="your-url-here",
        bypass_cache=True,
        word_count_threshold=10,  # Ignore content blocks with fewer than 10 words
        remove_overlay_elements=True,  # Strip popups, modals, and other overlays
        screenshot=True
    )
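
Since screenshot=True is passed here as well, the captured image can be saved the same way as in the screenshot example above; a minimal sketch (the output filename is arbitrary):

python
import base64

if result.success and result.screenshot:
    with open("cleaned_page.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))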