爬虫与数据采集
用 Python 从网页和 API 中提取结构化数据——requests + BeautifulSoup 经典组合,加上 async 提升采集效率。
爬虫技术栈
graph TD
CRAWL[Python 爬虫] --> HTTP[HTTP 请求]
CRAWL --> PARSE[页面解析]
CRAWL --> STORE[数据存储]
HTTP --> REQ[requests / httpx]
HTTP --> ASYNC[异步批量]
PARSE --> BS4[BeautifulSoup]
PARSE --> XPATH[lxml XPath]
PARSE --> CSS[CSS选择器]
STORE --> CSV[CSV / JSON]
STORE --> DB[SQLite / PostgreSQL]
STORE --> PARQ[Parquet]
style CRAWL fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
style PARSE fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
BeautifulSoup 解析
"""
BeautifulSoup:HTML 解析利器
pip install beautifulsoup4 lxml
"""
from bs4 import BeautifulSoup
from dataclasses import dataclass, field
# Sample HTML used by the parsing demo below: a heading plus two
# div.book entries, each carrying a title, author and price node.
html = """
<html>
<body>
<h1>Python 书籍推荐</h1>
<div class="book-list">
<div class="book" data-id="1">
<h2 class="title">Fluent Python</h2>
<span class="author">Luciano Ramalho</span>
<span class="price">¥89.00</span>
</div>
<div class="book" data-id="2">
<h2 class="title">Python Cookbook</h2>
<span class="author">David Beazley</span>
<span class="price">¥79.00</span>
</div>
</div>
</body>
</html>
"""
@dataclass
class BookInfo:
    """One book scraped from the listing page."""

    title: str    # book title text
    author: str   # author name text
    price: float  # numeric price, currency sign and separators stripped
def parse_books(html_content: str) -> list[BookInfo]:
    """Parse the book list out of an HTML document.

    Each ``div.book`` element is expected to contain an ``h2.title``,
    a ``span.author`` and a ``span.price`` node. Entries missing any of
    those nodes are skipped instead of raising; the price text has the
    currency sign and thousands separators stripped before conversion.

    Args:
        html_content: Raw HTML of the listing page.

    Returns:
        One ``BookInfo`` per complete ``div.book`` entry, in document order.
    """
    soup = BeautifulSoup(html_content, "lxml")
    books: list[BookInfo] = []
    for div in soup.select("div.book"):
        title_node = div.select_one("h2.title")
        author_node = div.select_one("span.author")
        price_node = div.select_one("span.price")
        # select_one returns None when the node is missing, which would
        # otherwise raise AttributeError on malformed entries — skip them.
        if title_node is None or author_node is None or price_node is None:
            continue
        price_text = price_node.get_text(strip=True)
        price = float(price_text.replace("¥", "").replace(",", ""))
        books.append(
            BookInfo(
                title=title_node.get_text(strip=True),
                author=author_node.get_text(strip=True),
                price=price,
            )
        )
    return books
# Demo: parse the sample document and print one line per book found.
result = parse_books(html)
for b in result:
    print(f" {b.title} by {b.author} - ¥{b.price}")
异步批量采集
"""
异步爬虫:高效批量采集
"""
import asyncio
import httpx
from dataclasses import dataclass, field
@dataclass
class AsyncCrawler:
    """Async crawler that fans out requests under a concurrency cap."""

    max_concurrent: int = 5               # upper bound on in-flight requests
    timeout: float = 10.0                 # per-request timeout in seconds
    results: list = field(default_factory=list)  # last crawl's result dicts

    async def fetch(self, client: httpx.AsyncClient, url: str) -> dict:
        """Fetch one URL and summarize the response as a dict.

        On transport/HTTP errors, returns a dict with ``status`` 0 and an
        ``error`` message instead of raising.
        """
        try:
            resp = await client.get(url, timeout=self.timeout)
        except httpx.HTTPError as exc:
            return {"url": url, "status": 0, "error": str(exc)}
        return {
            "url": url,
            "status": resp.status_code,
            "content": resp.text[:500],
        }

    async def crawl(self, urls: list[str]) -> list[dict]:
        """Fetch all URLs concurrently, capped by a semaphore.

        Stores the result dicts on ``self.results``, prints a success
        summary, and returns the results in the same order as ``urls``.
        """
        gate = asyncio.Semaphore(self.max_concurrent)

        async def guarded(client: httpx.AsyncClient, url: str) -> dict:
            # Hold a semaphore slot for the duration of each request.
            async with gate:
                return await self.fetch(client, url)

        async with httpx.AsyncClient() as client:
            self.results = await asyncio.gather(
                *(guarded(client, u) for u in urls)
            )
        success = sum(1 for r in self.results if r.get("status") == 200)
        print(f"完成: {success}/{len(urls)} 成功")
        return self.results
# Usage
async def main():
    """Demo: crawl ten httpbin pages and print the first few statuses."""
    urls = [f"https://httpbin.org/get?page={i}" for i in range(10)]
    crawler = AsyncCrawler(max_concurrent=3)
    results = await crawler.crawl(urls)
    for r in results[:3]:
        print(f" {r['url']} → {r['status']}")
# asyncio.run(main())
数据存储
"""
采集数据的存储方案
"""
import json
import csv
from pathlib import Path
from dataclasses import dataclass, asdict
@dataclass
class CrawledItem:
    """A single crawled page, ready for persistence."""

    url: str        # page URL that was fetched
    title: str      # extracted page title
    content: str    # extracted page body text
    timestamp: str  # crawl time as a string — format chosen by the caller
def save_json(items: list[CrawledItem], path: str):
    """Serialize *items* to a pretty-printed UTF-8 JSON array at *path*."""
    records = [asdict(it) for it in items]
    payload = json.dumps(records, ensure_ascii=False, indent=2)
    Path(path).write_text(payload, encoding="utf-8")
def save_csv(items: list[CrawledItem], path: str):
    """Write *items* as CSV (UTF-8 with BOM, so Excel opens it cleanly).

    Does nothing when *items* is empty; the header row is derived from
    the first item's fields.
    """
    if not items:
        return
    rows = [asdict(it) for it in items]
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
def append_jsonl(item: CrawledItem, path: str):
    """Append one record as a single JSON line (recommended for large crawls)."""
    line = json.dumps(asdict(item), ensure_ascii=False)
    with open(path, "a", encoding="utf-8") as f:
        f.write(line + "\n")
反爬与合规
| 策略 | 说明 | 实现方式 |
|---|---|---|
| 请求间隔 | 避免过快请求 | asyncio.sleep(1) |
| User-Agent | 模拟浏览器 | 设置 headers |
| robots.txt | 遵守站点规则 | 检查 /robots.txt |
| 重试机制 | 处理临时错误 | 指数退避 |
| 数据脱敏 | 不存储个人隐私 | 过滤 PII |
| 频率限制 | 控制并发数 | Semaphore |
本章小结
| 知识点 | 要点 |
|---|---|
| BeautifulSoup | CSS 选择器 + get_text 解析 |
| 异步爬虫 | httpx + Semaphore 并发控制 |
| 数据存储 | JSON / CSV / JSONL |
| 合规 | 遵守 robots.txt、控制频率 |
| 反爬应对 | 随机间隔、UA 轮换 |
延伸阅读:了解 Scrapy 框架,处理更复杂的分布式爬虫场景。