Python爬虫实战⑥|反爬虫应对策略,IP代理+随机延迟+UA伪装
·
author: 专注Python实战,分享爬虫与数据分析干货
title: Python爬虫实战⑥|反爬虫应对策略,IP代理+随机延迟+UA伪装
update: 2026-04-26
tags: Python,爬虫,反爬虫,IP代理,User-Agent,随机延迟,请求伪装
作者:专注Python实战,分享爬虫与数据分析干货
更新时间:2026年4月
适合人群:已掌握爬虫基础、遇到403/429/封IP问题的开发者
前言:为什么你的爬虫总被网站发现?
爬虫跑着跑着突然403了?抓了几页就IP被封?返回一堆验证码?
网站的反爬系统在盯着你。 它们通过这些特征识别爬虫:
- 同一个IP短时间大量请求
- User-Agent是默认的python-requests/2.x
- 请求间隔太规律(每次精确间隔1秒)
- 没有Cookie、没有Referer
- 同一时间段请求频率异常
今天教你5大反反爬策略,让爬虫"看起来像人"。
一、User-Agent伪装
1.1 为什么需要伪装UA?
import requests
# Without an explicit User-Agent header, requests announces itself as
# "python-requests/<version>", which is trivially recognisable as a bot.
# The timeout keeps the demo from hanging forever if the echo service stalls.
response = requests.get("https://httpbin.org/user-agent", timeout=10)
print(response.json())
# Example output: {'user-agent': 'python-requests/2.31.0'}  <- obviously a crawler
1.2 UA池随机轮换
import requests
import random
# Pool of desktop browser User-Agent strings, grouped by browser family.
USER_AGENTS = [
    # --- Chrome (Windows / macOS) ---
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # --- Firefox (Windows / macOS) ---
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    # --- Edge ---
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    # --- Safari ---
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
]


def get_random_ua():
    """Return one User-Agent string drawn at random from USER_AGENTS."""
    picked = random.choice(USER_AGENTS)
    return picked
# Send five requests, each with a freshly drawn User-Agent, and show the
# first 50 characters of what the echo service says it received.
for request_no in range(1, 6):
    headers = {"User-Agent": get_random_ua()}
    response = requests.get(
        "https://httpbin.org/user-agent",
        headers=headers,
        timeout=10,
    )
    ua = response.json()["user-agent"]
    print(f"请求{request_no}: {ua[:50]}...")
运行效果:
请求1: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537...
请求2: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKi...
请求3: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/...
请求4: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537...
请求5: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKi...
1.3 用fake-useragent库自动获取
# pip install fake-useragent
from fake_useragent import UserAgent
ua = UserAgent()
# A random User-Agent from any browser family
print(ua.random)
print(ua.chrome)   # Chrome only
print(ua.firefox)  # Firefox only
print(ua.safari)   # Safari only
# Using it in a crawler.
# `url` was never defined in the original snippet (NameError); the httpbin
# echo endpoint is used so the example is runnable as written.
url = "https://httpbin.org/user-agent"
headers = {"User-Agent": ua.random}
# A timeout prevents the request from blocking indefinitely.
response = requests.get(url, headers=headers, timeout=10)
二、随机延迟——模拟人类行为
2.1 固定延迟 vs 随机延迟
import time
import random
# Fixed delay: perfectly regular spacing between requests is easy to detect.
time.sleep(2) # always waits exactly 2 seconds
# Random delay: irregular spacing looks more like a human visitor.
delay = random.uniform(1, 5) # a float drawn uniformly between 1 and 5 seconds
time.sleep(delay)
2.2 智能延迟策略
import time
import random
def smart_delay(page, base_delay=2):
    """Compute a pause (in seconds) that grows with the page number.

    The pause is ``base_delay`` plus up to 2 s of random jitter plus 0.3 s per
    page. With 10% probability an extra 5-15 s "reading break" is added, in
    which case the total is also printed.

    Args:
        page: Page number being crawled; larger values give longer pauses.
        base_delay: Minimum pause in seconds before jitter and ramp-up.

    Returns:
        The number of seconds to sleep. This function does not sleep itself.
    """
    jitter = random.uniform(0, 2)
    ramp = page * 0.3
    delay = base_delay + jitter + ramp

    takes_break = random.random() < 0.1  # 10% chance of a long pause
    if not takes_break:
        return delay

    delay += random.uniform(5, 15)
    print(f" ☕ 模拟休息 {delay:.1f}s")
    return delay
# Usage: crawl pages 1-10, pausing for a page-dependent interval after each.
for page in range(1, 11):
    # ... fetch / parse logic for this page goes here ...
    delay = smart_delay(page)
    message = f" 等待 {delay:.1f}s"
    print(message)
    time.sleep(delay)
三、IP代理轮换
3.1 为什么需要代理IP?
网站通过IP识别爬虫:
- 同一IP短时间请求100次 → 封IP
- 即使换UA,IP还是同一个 → 封IP
- 只有换IP才能绕过
3.2 代理IP的获取方式
| 方式 | 费用 | 质量 | 适用场景 |
|---|---|---|---|
| 免费代理网站 | 免费 | 差(可用率<10%) | 学习测试 |
| 付费代理池 | 付费 | 好(可用率>90%) | 生产环境 |
| 自建代理 | 成本高 | 最好 | 大规模爬虫 |
| ADSL拨号换IP | 宽带费 | 好 | 小规模爬虫 |
3.3 代理IP实战
import requests
import random
import time
# Proxy pool. These addresses are placeholders; substitute working proxies
# before real use.
PROXY_POOL = [
    "http://1.2.3.4:8080",
    "http://5.6.7.8:8080",
    "http://9.10.11.12:8080",
    # add more proxies here...
]


def get_random_proxy():
    """Pick a random proxy and return it as a requests-style ``proxies`` dict.

    The same proxy URL is used for both the ``http`` and ``https`` schemes.
    """
    chosen = random.choice(PROXY_POOL)
    return dict.fromkeys(("http", "https"), chosen)
def test_proxy(proxy_dict, test_url="https://httpbin.org/ip"):
    """Check whether a proxy can complete a request.

    Args:
        proxy_dict: requests-style proxies mapping, e.g.
            ``{"http": url, "https": url}``.
        test_url: Endpoint to fetch through the proxy; its JSON response is
            expected to contain an ``"origin"`` field.

    Returns:
        ``(True, origin)`` when the request returns HTTP 200 with a JSON body
        containing ``"origin"``; otherwise ``(False, None)``.
    """
    try:
        response = requests.get(test_url, proxies=proxy_dict, timeout=5)
        if response.status_code == 200:
            return True, response.json()["origin"]
    except (requests.RequestException, ValueError, KeyError):
        # Only expected failures count as "proxy unusable": connection or
        # timeout errors, a non-JSON body, or JSON without "origin".
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit
        # and hide genuine programming errors.
        pass
    return False, None
# Probe every proxy in the pool, report each result, and collect the ones
# that responded successfully.
print("测试代理池...")
available = []
for proxy_url in PROXY_POOL:
    proxy_dict = dict.fromkeys(("http", "https"), proxy_url)
    ok, ip = test_proxy(proxy_dict)
    if ok:
        status = "✓"
    else:
        status = "✗"
    print(f" {status} {proxy_url} → {ip or '不可用'}")
    if ok:
        available.append(proxy_url)

usable, total = len(available), len(PROXY_POOL)
print(f"\n可用代理: {usable}/{total}")
3.4 带代理和重试的爬虫
import requests
import random
import time
class ProxyCrawler:
    """Crawler that picks a new proxy and User-Agent for every attempt.

    Attributes:
        session: Shared ``requests.Session`` used for all requests.
        proxy_pool: List of proxy URLs to choose from (may be empty).
        current_proxy: The proxies mapping chosen by the last rotation.
        headers_pool: Candidate header dicts (User-Agent only).
    """

    def __init__(self, proxy_pool=None):
        """Create the session and store the proxy pool (empty if not given)."""
        self.session = requests.Session()
        self.proxy_pool = proxy_pool or []
        self.current_proxy = None
        self.headers_pool = [
            {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
            {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"},
            {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"},
        ]

    def rotate_proxy(self):
        """Choose a random proxy and remember it as ``current_proxy``.

        Returns:
            A ``{"http": url, "https": url}`` mapping, or ``None`` when the
            pool is empty (``current_proxy`` is left unchanged in that case).
        """
        if not self.proxy_pool:
            return None
        proxy_url = random.choice(self.proxy_pool)
        self.current_proxy = {"http": proxy_url, "https": proxy_url}
        return self.current_proxy

    def rotate_headers(self):
        """Choose random headers, merge them into the session, and return them."""
        headers = random.choice(self.headers_pool)
        self.session.headers.update(headers)
        return headers

    @staticmethod
    def _backoff_seconds(attempt):
        """Return the pause before the next retry: linear growth plus jitter."""
        return (attempt + 1) * 2 + random.uniform(0, 2)

    def fetch(self, url, max_retries=3, timeout=10):
        """GET ``url``, rotating proxy and headers on every attempt.

        A 403/429 response or a ``requests.RequestException`` counts as a
        failed attempt. After any failed attempt except the last, the crawler
        backs off before retrying. Previously a 403/429 was retried
        immediately, which hammers a server that has just asked the client to
        slow down.

        Args:
            url: Address to fetch.
            max_retries: Maximum number of attempts.
            timeout: Per-request timeout in seconds.

        Returns:
            The successful response, or ``None`` if every attempt failed.
        """
        for attempt in range(max_retries):
            try:
                self.rotate_headers()
                proxies = self.rotate_proxy()
                response = self.session.get(
                    url,
                    proxies=proxies,
                    timeout=timeout,
                )
                # Anti-bot responses: report them and fall through to backoff.
                if response.status_code == 429:
                    print(" 被限速(429),换代理重试...")
                elif response.status_code == 403:
                    print(" 被拒绝(403),换代理重试...")
                else:
                    response.raise_for_status()
                    return response
            except requests.RequestException as e:
                print(f" 第{attempt+1}次失败: {e}")
            # Reached only when this attempt failed; no point in sleeping
            # after the final attempt.
            if attempt < max_retries - 1:
                time.sleep(self._backoff_seconds(attempt))
        return None
# Usage: fetch five list pages through the proxy pool, with a short random
# pause after each page.
crawler = ProxyCrawler(proxy_pool=PROXY_POOL)
for page in range(1, 6):
    url = f"https://example.com/list?page={page}"
    print(f"抓取第{page}页...", end=" ")
    response = crawler.fetch(url)
    outcome = f"OK ({len(response.text)} bytes)" if response else "失败"
    print(outcome)
    pause = random.uniform(1, 3)
    time.sleep(pause)
四、请求频率控制
4.1 令牌桶算法
import time
class RateLimiter:
    """Blocking token-bucket rate limiter.

    Tokens accumulate at ``rate`` per second up to ``capacity``. Each call to
    :meth:`acquire` consumes one token, sleeping first if none is available.
    """

    def __init__(self, rate=1.0, capacity=5, clock=time.monotonic, sleep=time.sleep):
        """
        Args:
            rate: Tokens generated per second (1.0 = one request per second).
            capacity: Bucket size, i.e. the largest burst allowed.
            clock: Zero-argument callable returning the current time in
                seconds. Defaults to a monotonic clock so wall-clock
                adjustments cannot distort the refill.
            sleep: Callable used to wait; takes a duration in seconds.

        Raises:
            ValueError: If ``rate`` is not positive (the wait computation
                divides by it).
        """
        if rate <= 0:
            raise ValueError("rate must be positive")
        self.rate = rate
        self.capacity = capacity
        self.tokens = capacity
        self._clock = clock
        self._sleep = sleep
        self.last_time = clock()

    def acquire(self):
        """Take one token, blocking until one is available."""
        now = self._clock()
        elapsed = now - self.last_time
        self.last_time = now
        # Refill for the time that passed since the previous call.
        self.tokens = min(self.capacity, self.tokens + elapsed * self.rate)
        if self.tokens < 1:
            # Not enough tokens: wait exactly long enough to earn the missing
            # fraction, then spend the resulting token.
            wait_time = (1 - self.tokens) / self.rate
            self._sleep(wait_time)
            # Restart the refill clock AFTER the sleep. Leaving last_time at
            # the pre-sleep instant would credit the sleep a second time on
            # the next call and roughly double the effective rate.
            self.last_time = self._clock()
            self.tokens = 0
        else:
            self.tokens -= 1
# Usage: a bucket refilled at one token per second with room for 3 tokens.
limiter = RateLimiter(rate=1.0, capacity=3)
for i in range(10):
    limiter.acquire()
    stamp = time.strftime('%H:%M:%S')
    print(f"请求 {i+1} @ {stamp}")
五、完整反反爬爬虫
把所有策略整合到一个类里:
import requests
import random
import time
from fake_useragent import UserAgent
class StealthCrawler:
    """Crawler combining random UA, full headers, proxies, delays and retries.

    Attributes:
        session: Shared ``requests.Session`` (keeps cookies between requests).
        ua: ``fake_useragent.UserAgent`` used to draw User-Agent strings.
        proxy_pool: List of proxy URLs (may be empty, meaning direct access).
        min_delay / max_delay: Bounds, in seconds, of the random pause taken
            before every attempt.
        rate_limit: Maximum requests per second; falsy or non-positive
            disables the limit.
        request_count: Number of successful requests so far.
    """

    def __init__(self, proxy_pool=None, min_delay=1, max_delay=3, rate_limit=1.0):
        """Store configuration; see the class docstring for the parameters."""
        self.session = requests.Session()
        self.ua = UserAgent()
        self.proxy_pool = proxy_pool or []
        self.min_delay = min_delay
        self.max_delay = max_delay
        # Previously accepted but silently ignored; now enforced in fetch().
        self.rate_limit = rate_limit
        self.request_count = 0
        self._last_request_at = None  # monotonic timestamp of the last send

    def _build_headers(self, referer=None):
        """Build browser-like headers with a random User-Agent.

        Args:
            referer: Optional value for the ``Referer`` header.
        """
        headers = {
            "User-Agent": self.ua.random,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            # NOTE(review): advertising "br" assumes the runtime can decode
            # Brotli (requests needs an extra brotli package for that) —
            # confirm, otherwise response.text may be undecoded bytes.
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }
        if referer:
            headers["Referer"] = referer
        return headers

    def _get_proxy(self):
        """Return a random requests-style proxies dict, or ``None`` if no pool."""
        if not self.proxy_pool:
            return None
        proxy_url = random.choice(self.proxy_pool)
        return {"http": proxy_url, "https": proxy_url}

    def _seconds_until_allowed(self):
        """Return how long to wait so that ``rate_limit`` is not exceeded."""
        if not self.rate_limit or self.rate_limit <= 0:
            return 0.0
        if self._last_request_at is None:
            return 0.0
        min_interval = 1.0 / self.rate_limit
        elapsed = time.monotonic() - self._last_request_at
        return max(0.0, min_interval - elapsed)

    def fetch(self, url, referer=None, max_retries=3, timeout=15):
        """GET ``url`` with human-like pacing and retries.

        Before each attempt the crawler sleeps for a random delay (with an
        occasional longer pause once at least one request has succeeded),
        extended if needed to honour ``rate_limit``. A 403/429 response or a
        ``requests.RequestException`` triggers a backoff and retry. No backoff
        is taken after the final attempt, since nothing follows it.

        Args:
            url: Address to fetch.
            referer: Optional ``Referer`` header value.
            max_retries: Maximum number of attempts.
            timeout: Per-request timeout in seconds.

        Returns:
            The successful response, or ``None`` if every attempt failed.
        """
        for attempt in range(max_retries):
            is_last_attempt = attempt >= max_retries - 1
            try:
                # Random pause before sending.
                delay = random.uniform(self.min_delay, self.max_delay)
                # Occasionally linger much longer, like a user reading a page.
                if self.request_count > 0 and random.random() < 0.1:
                    delay += random.uniform(5, 10)
                # Never send sooner than the configured rate allows.
                delay = max(delay, self._seconds_until_allowed())
                time.sleep(delay)

                headers = self._build_headers(referer)
                proxies = self._get_proxy()
                self._last_request_at = time.monotonic()
                response = self.session.get(
                    url,
                    headers=headers,
                    proxies=proxies,
                    timeout=timeout,
                )

                # Anti-bot detection.
                if response.status_code in (403, 429):
                    if is_last_attempt:
                        print(f" ⚠ 状态码{response.status_code},已达最大重试次数")
                    else:
                        wait = (attempt + 1) * 5 + random.uniform(0, 3)
                        print(f" ⚠ 状态码{response.status_code},等待{wait:.0f}s后重试")
                        time.sleep(wait)
                    continue

                response.raise_for_status()
                self.request_count += 1
                return response
            except requests.RequestException as e:
                print(f" 重试 {attempt+1}/{max_retries}: {e}")
                if not is_last_attempt:
                    time.sleep((attempt + 1) * 2)
        return None
# Usage example: walk pages 1-10, presenting the previous page as Referer so
# the navigation looks like in-site browsing.
crawler = StealthCrawler(min_delay=1, max_delay=3, rate_limit=1.0)
for page in range(1, 11):
    url = f"https://example.com/list?page={page}"
    if page > 1:
        referer = f"https://example.com/list?page={page-1}"
    else:
        referer = None  # the first page has no predecessor
    response = crawler.fetch(url, referer=referer)
    if not response:
        print(f"第{page}页: 失败")
    else:
        print(f"第{page}页: OK ({len(response.text)} bytes)")
六、知识卡
| 策略 | 说明 | 难度 |
|---|---|---|
| UA伪装 | 随机切换User-Agent | ⭐ |
| 随机延迟 | 请求间隔随机化 | ⭐ |
| Referer伪装 | 模拟从站内跳转 | ⭐ |
| Cookie管理 | Session保持登录态 | ⭐⭐ |
| IP代理轮换 | 每次请求换IP | ⭐⭐ |
| 令牌桶限流 | 控制请求频率 | ⭐⭐ |
| 请求头完整化 | 补全Accept/Language等 | ⭐ |
| fake-useragent | 自动获取真实UA | ⭐ |
七、课后作业
必做题:
- 搭建一个UA池(至少10个不同的User-Agent)
- 实现随机延迟策略,让请求间隔在1-5秒之间随机
- 用httpbin.org测试你的请求头和代理是否生效
选做题:
- 实现一个完整的StealthCrawler类,包含UA轮换+代理+延迟
- 搭建一个免费代理IP池,自动检测可用性
完成作业的同学,把运行截图发到评论区!
反反爬的核心原则:让爬虫看起来像人。 UA随机、间隔随机、IP轮换、频率控制,缺一不可。
本篇要点:
- UA伪装(手动池 + fake-useragent库)
- 随机延迟(均匀分布 + 智能策略)
- IP代理轮换(代理池 + 可用性检测)
- 令牌桶限流
- 完整StealthCrawler类
下一篇学习XPath精准定位——比CSS选择器更强大的数据提取方式。
收藏 + 关注,专栏更新不迷路!
有问题欢迎评论区留言,大家一起讨论!
标签:Python | 反爬虫 | IP代理 | User-Agent | 随机延迟 | 请求伪装 | 爬虫进阶
更多推荐


所有评论(0)