# Crawl4AI Quickstart Guide
## Getting Started 🛠️
First, let's import the necessary modules and create an `AsyncWebCrawler` instance. We'll use an async context manager, which handles the crawler's setup and cleanup for us.
```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Add our crawling code here
        pass

if __name__ == "__main__":
    asyncio.run(main())
```
## Basic Usage
Just provide a URL and let Crawl4AI work its magic!
```python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"Basic crawl result: {result.markdown[:500]}")  # Print the first 500 characters

asyncio.run(main())
```
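Beyond `markdown`, the result object exposes a few other useful fields. The attribute names below (`success`, `html`) are assumptions based on common `CrawlResult` fields, so check the docs for your installed version. A minimal sketch:

```python
async def inspect_result():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        # Attribute names are assumptions; verify against your Crawl4AI version.
        print("Success:", result.success)
        print("Raw HTML length:", len(result.html or ""))
        print("Markdown length:", len(result.markdown or ""))

asyncio.run(inspect_result())
```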
## Screenshots 📸
Capture screenshots of web pages with ease:
```python
async def capture_and_save_screenshot(url: str, output_path: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
            bypass_cache=True
        )
        if result.success and result.screenshot:
            import base64
            # The screenshot is returned as a base64-encoded string
            screenshot_data = base64.b64decode(result.screenshot)
            with open(output_path, 'wb') as f:
                f.write(screenshot_data)
            print(f"Screenshot saved to {output_path}")
        else:
            print("Failed to capture screenshot")

asyncio.run(capture_and_save_screenshot("https://www.example.com", "screenshot.png"))
```
## Browser Selection 🌐
Crawl4AI supports multiple browser engines. Here are examples of using different browsers:
```python
# Use Firefox
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)

# Use WebKit
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)

# Use Chromium (the default)
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
```
## User Simulation 🎭
Simulate real user behavior to avoid detection:
```python
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
    result = await crawler.arun(
        url="YOUR-URL-HERE",
        bypass_cache=True,
        simulate_user=True,       # Triggers random mouse movements and clicks
        override_navigator=True   # Makes the browser look more like a real user
    )
```
## Understanding Parameters 🧠
By default, Crawl4AI caches your crawl results. This means subsequent crawls of the same URL will be much faster! Let's see this in action.
```python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # First crawl (caches the result)
        result1 = await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"First crawl result: {result1.markdown[:100]}...")

        # Force a fresh crawl
        result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True)
        print(f"Second crawl result: {result2.markdown[:100]}...")

asyncio.run(main())
```
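To see the cache at work, you can time a cached crawl against a bypassed one. This is a minimal sketch using the same URL as above; the actual speedup depends on the page and your connection:

```python
import time

async def compare_cache_timing():
    async with AsyncWebCrawler(verbose=True) as crawler:
        start = time.perf_counter()
        await crawler.arun(url="https://www.nbcnews.com/business")  # served from cache if previously crawled
        cached = time.perf_counter() - start

        start = time.perf_counter()
        await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True)  # forces a fresh fetch
        fresh = time.perf_counter() - start

        print(f"Cached: {cached:.2f}s, fresh: {fresh:.2f}s")

asyncio.run(compare_cache_timing())
```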
## Adding a Chunking Strategy 🧩
Let's add a chunking strategy: `RegexChunking`! This strategy splits the text based on a given regex pattern.
```python
from crawl4ai.chunking_strategy import RegexChunking

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            chunking_strategy=RegexChunking(patterns=["\n\n"])
        )
        print(f"RegexChunking result: {result.extracted_content[:200]}...")

asyncio.run(main())
```
## Using Different LLM Extraction Strategies 🤖
Crawl4AI supports multiple LLM providers for extraction:
```python
import os
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input tokens for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output tokens for the OpenAI model.")

# OpenAI
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))

# Hugging Face
await extract_structured_data_using_llm(
    "huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct",
    os.getenv("HUGGINGFACE_API_KEY")
)

# Ollama
await extract_structured_data_using_llm("ollama/llama3.2")

# With custom headers
custom_headers = {
    "Authorization": "Bearer your-custom-token",
    "X-Custom-Header": "Some-Value"
}
await extract_structured_data_using_llm(extra_headers=custom_headers)
```
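The calls above rely on a helper, `extract_structured_data_using_llm`, that isn't defined in this excerpt. Below is a minimal sketch of how such a helper might wrap `LLMExtractionStrategy` with the `OpenAIModelFee` schema; the signature, target URL, instruction text, and the `extra_headers` pass-through are assumptions, so adapt them to the full quickstart example:

```python
async def extract_structured_data_using_llm(provider: str = "openai/gpt-4o",
                                            api_token: str = None,
                                            extra_headers: dict = None):
    # Hypothetical helper: builds an LLM extraction strategy for the given provider
    # and runs a single crawl against a pricing page (the URL is an assumption).
    # Passing extra headers through to the provider is also an assumption about this version's API.
    extra_args = {"extra_headers": extra_headers} if extra_headers else {}
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider=provider,
                api_token=api_token,
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="Extract all model names along with their input and output token fees.",
                **extra_args,
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)
```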
## Knowledge Graph Generation 🕸️
Generate knowledge graphs from web content:
```python
import os
from typing import List
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class Entity(BaseModel):
    name: str
    description: str

class Relationship(BaseModel):
    entity1: Entity
    entity2: Entity
    description: str
    relation_type: str

class KnowledgeGraph(BaseModel):
    entities: List[Entity]
    relationships: List[Relationship]

extraction_strategy = LLMExtractionStrategy(
    provider='openai/gpt-4o-mini',
    api_token=os.getenv('OPENAI_API_KEY'),
    schema=KnowledgeGraph.model_json_schema(),
    extraction_type="schema",
    instruction="Extract entities and relationships from the given text."
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://paulgraham.com/love.html",
        bypass_cache=True,
        extraction_strategy=extraction_strategy
    )
```
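The extracted graph comes back as a JSON string in `result.extracted_content`. Here is a minimal sketch for loading it back into the `KnowledgeGraph` model; the exact shape (a single object vs. a list of blocks) can vary by provider, which is why both cases are handled:

```python
import json

# extracted_content is a JSON string; inspect it before validating,
# since some providers return a list of objects rather than a single object.
data = json.loads(result.extracted_content)
graph = KnowledgeGraph.model_validate(data[0] if isinstance(data, list) else data)
print(f"Entities: {len(graph.entities)}, relationships: {len(graph.relationships)}")
```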
## Advanced Session-Based Crawling with Dynamic Content 🔄
For modern web applications that load content dynamically, here's how to handle pagination and content updates:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def crawl_dynamic_content():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"

        # JavaScript to click the "next page" button
        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        # Wait until the first visible commit differs from the previous page
        wait_for = """() => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            if (commits.length === 0) return false;
            const firstCommit = commits[0].textContent.trim();
            return firstCommit !== window.firstCommit;
        }"""

        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
            "fields": [
                {
                    "name": "title",
                    "selector": "h4.markdown-title",
                    "type": "text",
                    "transform": "strip",
                },
            ],
        }
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page if page > 0 else None,
                wait_for=wait_for if page > 0 else None,
                js_only=page > 0,
                bypass_cache=True,
                headless=False,
            )

        await crawler.crawler_strategy.kill_session(session_id)
```
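Unlike the earlier examples, this block only defines the coroutine; run it with `asyncio.run` from your script's entry point:

```python
# Kick off the session-based crawl defined above.
asyncio.run(crawl_dynamic_content())
```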
## Handling Overlays and Fitting Content 📏
Remove overlay elements and fit the content appropriately:
```python
async with AsyncWebCrawler(headless=False) as crawler:
    result = await crawler.arun(
        url="your-url-here",
        bypass_cache=True,
        word_count_threshold=10,
        remove_overlay_elements=True,
        screenshot=True
    )
```
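To see what `remove_overlay_elements` actually changes, you can crawl the same URL with and without it and compare the resulting markdown. This is a minimal sketch; the size of the difference depends on how many popups and banners the page uses:

```python
async def compare_overlay_removal(url: str):
    async with AsyncWebCrawler(headless=True) as crawler:
        with_overlays = await crawler.arun(url=url, bypass_cache=True, word_count_threshold=10)
        without_overlays = await crawler.arun(
            url=url,
            bypass_cache=True,
            word_count_threshold=10,
            remove_overlay_elements=True,
        )
        print(f"Markdown length with overlays:    {len(with_overlays.markdown or '')}")
        print(f"Markdown length without overlays: {len(without_overlays.markdown or '')}")

asyncio.run(compare_overlay_removal("https://www.example.com"))
```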