爬虫与数据采集
用 Python 从网页和 API 中提取结构化数据——requests + BeautifulSoup 经典组合,加上 async 提升采集效率。
爬虫技术栈
graph TD
CRAWL[Python 爬虫] --> HTTP[HTTP 请求]
CRAWL --> PARSE[页面解析]
CRAWL --> STORE[数据存储]
HTTP --> REQ[requests / httpx]
HTTP --> ASYNC[异步批量]
PARSE --> BS4[BeautifulSoup]
PARSE --> XPATH[lxml XPath]
PARSE --> CSS[CSS选择器]
STORE --> CSV[CSV / JSON]
STORE --> DB[SQLite / PostgreSQL]
STORE --> PARQ[Parquet]
style CRAWL fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
style PARSE fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
BeautifulSoup 解析
"""
BeautifulSoup:HTML 解析利器
pip install beautifulsoup4 lxml
"""
from bs4 import BeautifulSoup
from dataclasses import dataclass, field
# Sample HTML used by the parsing demo below: a heading plus two
# div.book entries, each carrying a title, author and price node.
html = """
<html>
<body>
<h1>Python 书籍推荐</h1>
<div class="book-list">
<div class="book" data-id="1">
<h2 class="title">Fluent Python</h2>
<span class="author">Luciano Ramalho</span>
<span class="price">¥89.00</span>
</div>
<div class="book" data-id="2">
<h2 class="title">Python Cookbook</h2>
<span class="author">David Beazley</span>
<span class="price">¥79.00</span>
</div>
</div>
</body>
</html>
"""
@dataclass
class BookInfo:
    """One book scraped from the listing page."""

    title: str    # book title text
    author: str   # author name text
    price: float  # numeric price, currency sign and separators stripped
def parse_books(html_content: str) -> list[BookInfo]:
    """Parse the book list out of an HTML document.

    Each ``div.book`` element is expected to contain an ``h2.title``,
    a ``span.author`` and a ``span.price`` node. Entries missing any of
    those nodes are skipped instead of raising; the price text has the
    currency sign and thousands separators stripped before conversion.

    Args:
        html_content: Raw HTML of the listing page.

    Returns:
        One ``BookInfo`` per complete ``div.book`` entry, in document order.
    """
    soup = BeautifulSoup(html_content, "lxml")
    books: list[BookInfo] = []
    for div in soup.select("div.book"):
        title_node = div.select_one("h2.title")
        author_node = div.select_one("span.author")
        price_node = div.select_one("span.price")
        # select_one returns None when the node is missing, which would
        # otherwise raise AttributeError on malformed entries — skip them.
        if title_node is None or author_node is None or price_node is None:
            continue
        price_text = price_node.get_text(strip=True)
        price = float(price_text.replace("¥", "").replace(",", ""))
        books.append(
            BookInfo(
                title=title_node.get_text(strip=True),
                author=author_node.get_text(strip=True),
                price=price,
            )
        )
    return books
# Demo: parse the sample document and print one line per book found.
result = parse_books(html)
for b in result:
    print(f" {b.title} by {b.author} - ¥{b.price}")
异步批量采集
"""
异步爬虫:高效批量采集
"""
import asyncio
import httpx
from dataclasses import dataclass, field
@dataclass
class AsyncCrawler:
    """Async crawler that fans out requests under a concurrency cap."""

    max_concurrent: int = 5               # upper bound on in-flight requests
    timeout: float = 10.0                 # per-request timeout in seconds
    results: list = field(default_factory=list)  # last crawl's result dicts

    async def fetch(self, client: httpx.AsyncClient, url: str) -> dict:
        """Fetch one URL and summarize the response as a dict.

        On transport/HTTP errors, returns a dict with ``status`` 0 and an
        ``error`` message instead of raising.
        """
        try:
            resp = await client.get(url, timeout=self.timeout)
        except httpx.HTTPError as exc:
            return {"url": url, "status": 0, "error": str(exc)}
        return {
            "url": url,
            "status": resp.status_code,
            "content": resp.text[:500],
        }

    async def crawl(self, urls: list[str]) -> list[dict]:
        """Fetch all URLs concurrently, capped by a semaphore.

        Stores the result dicts on ``self.results``, prints a success
        summary, and returns the results in the same order as ``urls``.
        """
        gate = asyncio.Semaphore(self.max_concurrent)

        async def guarded(client: httpx.AsyncClient, url: str) -> dict:
            # Hold a semaphore slot for the duration of each request.
            async with gate:
                return await self.fetch(client, url)

        async with httpx.AsyncClient() as client:
            self.results = await asyncio.gather(
                *(guarded(client, u) for u in urls)
            )
        success = sum(1 for r in self.results if r.get("status") == 200)
        print(f"完成: {success}/{len(urls)} 成功")
        return self.results
# Usage
async def main():
    """Demo: crawl ten httpbin pages and print the first few statuses."""
    urls = [f"https://httpbin.org/get?page={i}" for i in range(10)]
    crawler = AsyncCrawler(max_concurrent=3)
    results = await crawler.crawl(urls)
    for r in results[:3]:
        print(f" {r['url']} → {r['status']}")
# asyncio.run(main())
数据存储
"""
采集数据的存储方案
"""
import json
import csv
from pathlib import Path
from dataclasses import dataclass, asdict
@dataclass
class CrawledItem:
    """A single crawled page, ready for persistence."""

    url: str        # page URL that was fetched
    title: str      # extracted page title
    content: str    # extracted page body text
    timestamp: str  # crawl time as a string — format chosen by the caller
def save_json(items: list[CrawledItem], path: str):
    """Serialize *items* to a pretty-printed UTF-8 JSON array at *path*."""
    records = [asdict(it) for it in items]
    payload = json.dumps(records, ensure_ascii=False, indent=2)
    Path(path).write_text(payload, encoding="utf-8")
def save_csv(items: list[CrawledItem], path: str):
    """Write *items* as CSV (UTF-8 with BOM, so Excel opens it cleanly).

    Does nothing when *items* is empty; the header row is derived from
    the first item's fields.
    """
    if not items:
        return
    rows = [asdict(it) for it in items]
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
def append_jsonl(item: CrawledItem, path: str):
    """Append one record as a single JSON line (recommended for large crawls)."""
    line = json.dumps(asdict(item), ensure_ascii=False)
    with open(path, "a", encoding="utf-8") as f:
        f.write(line + "\n")
反爬与合规
| 策略 | 说明 | 实现方式 |
|---|---|---|
| 请求间隔 | 避免过快请求 | asyncio.sleep(1) |
| User-Agent | 模拟浏览器 | 设置 headers |
| robots.txt | 遵守站点规则 | 检查 /robots.txt |
| 重试机制 | 处理临时错误 | 指数退避 |
| 数据脱敏 | 不存储个人隐私 | 过滤 PII |
| 频率限制 | 控制并发数 | Semaphore |
本章小结
| 知识点 | 要点 |
|---|---|
| BeautifulSoup | CSS 选择器 + get_text 解析 |
| 异步爬虫 | httpx + Semaphore 并发控制 |
| 数据存储 | JSON / CSV / JSONL |
| 合规 | 遵守 robots.txt、控制频率 |
| 反爬应对 | 随机间隔、UA 轮换 |
延伸阅读:了解 Scrapy 框架,处理更复杂的分布式爬虫场景。