author: 专注Python实战,分享爬虫与数据分析干货
title: Python爬虫实战⑥|反爬虫应对策略,IP代理+随机延迟+UA伪装
update: 2026-04-26
tags: Python,爬虫,反爬虫,IP代理,User-Agent,随机延迟,请求伪装

作者:专注Python实战,分享爬虫与数据分析干货
更新时间:2026年4月
适合人群:已掌握爬虫基础、遇到403/429/封IP问题的开发者


前言:为什么你的爬虫总被网站发现?

爬虫跑着跑着突然403了?抓了几页就IP被封?返回一堆验证码?

网站的反爬系统在盯着你。 它们通过这些特征识别爬虫:

  • 同一个IP短时间大量请求
  • User-Agent是默认的python-requests/2.x
  • 请求间隔太规律(每次精确间隔1秒)
  • 没有Cookie、没有Referer
  • 同一时间段请求频率异常

今天教你5大反反爬策略,让爬虫"看起来像人"。


一、User-Agent伪装

1.1 为什么需要伪装UA?

import requests

# Default User-Agent sent by requests (an obvious crawler signature).
# A timeout is passed explicitly: without one, requests may wait forever on a
# stalled connection.
response = requests.get("https://httpbin.org/user-agent", timeout=10)
print(response.json())
# {'user-agent': 'python-requests/2.31.0'}  <- the site can tell at a glance that this is a crawler

1.2 UA池随机轮换

import requests
import random

# Pool of desktop browser User-Agent strings, grouped by browser family.
USER_AGENTS = [
    # Chrome
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Firefox
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Edge
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    # Safari
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
]

def get_random_ua():
    """Return one User-Agent string chosen at random from USER_AGENTS."""
    picked = random.choice(USER_AGENTS)
    return picked

# Switch to a randomly chosen User-Agent on every request
for i in range(5):
    headers = {"User-Agent": get_random_ua()}
    response = requests.get("https://httpbin.org/user-agent", headers=headers, timeout=10)
    # Read back the "user-agent" field of the JSON reply to see what was sent
    ua = response.json()["user-agent"]
    # Only the first 50 characters are printed to keep each line short
    print(f"请求{i+1}: {ua[:50]}...")

运行效果:

请求1: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537...
请求2: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKi...
请求3: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/...
请求4: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537...
请求5: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKi...

1.3 用fake-useragent库自动获取

# pip install fake-useragent
from fake_useragent import UserAgent

import requests

ua = UserAgent()

# Random User-Agent from any browser family
print(ua.random)
print(ua.chrome)   # Chrome only
print(ua.firefox)  # Firefox only
print(ua.safari)   # Safari only

# Using it in a crawler
url = "https://httpbin.org/user-agent"  # replace with the page you want to fetch
headers = {"User-Agent": ua.random}
# Explicit timeout so a stalled connection cannot hang the script
response = requests.get(url, headers=headers, timeout=10)

二、随机延迟——模拟人类行为

2.1 固定延迟 vs 随机延迟

import time
import random

# Fixed delay (too regular, easy to detect)
time.sleep(2)  # waits exactly 2 seconds every time

# Random delay (closer to human behavior)
delay = random.uniform(1, 5)  # random float between 1 and 5 seconds
time.sleep(delay)

2.2 智能延迟策略

import time
import random

def smart_delay(page, base_delay=2):
    """Compute a wait time that grows with the page number.

    Args:
        page: Page number being fetched; every page adds 0.3 seconds.
        base_delay: Base wait in seconds before jitter is added.

    Returns:
        The number of seconds to wait. Nothing is slept here; the caller
        decides when to sleep.
    """
    jitter = random.uniform(0, 2)
    delay = base_delay + jitter + (page * 0.3)

    # About one call in ten adds a long pause, imitating a user who stops
    # to read the page.
    taking_break = random.random() < 0.1
    if taking_break:
        delay += random.uniform(5, 15)
        print(f"  ☕ 模拟休息 {delay:.1f}s")
    return delay

# Usage: wait a page-dependent, randomized time after each page
for page in range(1, 11):
    # ... scraping logic ...
    delay = smart_delay(page)
    print(f"  等待 {delay:.1f}s")
    # smart_delay only returns the value; the actual wait happens here
    time.sleep(delay)

三、IP代理轮换

3.1 为什么需要代理IP?

网站通过IP识别爬虫:

  • 同一IP短时间请求100次 → 封IP
  • 即使换UA,IP还是同一个 → 封IP
  • 只有换IP才能绕过

3.2 代理IP的获取方式

方式 费用 质量 适用场景
免费代理网站 免费 差(可用率<10%) 学习测试
付费代理池 付费 好(可用率>90%) 生产环境
自建代理 成本高 最好 大规模爬虫
ADSL拨号换IP 宽带费 小规模爬虫

3.3 代理IP实战

import requests
import random
import time

# Proxy pool (placeholder addresses; replace with working proxies for real use)
PROXY_POOL = [
    "http://1.2.3.4:8080",
    "http://5.6.7.8:8080",
    "http://9.10.11.12:8080",
    # more proxies...
]

def get_random_proxy():
    """Pick a random proxy and return it as a requests-style proxies mapping.

    The same proxy URL is used for both the "http" and "https" keys.
    """
    chosen = random.choice(PROXY_POOL)
    return dict(http=chosen, https=chosen)

def test_proxy(proxy_dict, test_url="https://httpbin.org/ip"):
    """Check whether a proxy can complete a request.

    Args:
        proxy_dict: requests-style proxies mapping, e.g.
            {"http": "http://host:port", "https": "http://host:port"}.
        test_url: URL to fetch through the proxy. The response is expected
            to be JSON containing an "origin" field.

    Returns:
        (True, origin) when the request returns HTTP 200 and the body has an
        "origin" field; (False, None) in every other case.
    """
    try:
        response = requests.get(test_url, proxies=proxy_dict, timeout=5)
        if response.status_code == 200:
            return True, response.json()["origin"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # RequestException: connection, timeout or proxy failure.
        # ValueError: body is not valid JSON.
        # KeyError / TypeError: JSON has no "origin" field or is not an object.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed,
        # so Ctrl-C still stops a long pool check.
        pass
    return False, None

# Check every proxy in the pool and collect the ones that work
print("测试代理池...")
available = []
for proxy_url in PROXY_POOL:
    proxy_dict = {"http": proxy_url, "https": proxy_url}
    ok, ip = test_proxy(proxy_dict)
    status = "✓" if ok else "✗"
    # Separate the proxy URL from the reported IP; without a separator the
    # two values run together (e.g. "...:80801.2.3.4").
    print(f"  {status} {proxy_url} → {ip or '不可用'}")
    if ok:
        available.append(proxy_url)

print(f"\n可用代理: {len(available)}/{len(PROXY_POOL)}")

3.4 带代理和重试的爬虫

import requests
import random
import time

class ProxyCrawler:
    """Crawler that rotates proxies and User-Agent headers and retries failures."""

    def __init__(self, proxy_pool=None):
        """Create the crawler.

        Args:
            proxy_pool: Optional list of proxy URLs. When omitted or empty,
                requests are sent without a proxy.
        """
        # A single Session is reused for every request.
        self.session = requests.Session()
        self.proxy_pool = proxy_pool or []
        # Proxies mapping chosen by the latest rotate_proxy() call, or None.
        self.current_proxy = None
        # Header sets to rotate through; each one only sets User-Agent.
        self.headers_pool = [
            {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
            {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"},
            {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"},
        ]

    def rotate_proxy(self):
        """Switch to another proxy from the pool.

        The proxy currently in use is skipped whenever the pool offers an
        alternative, so a retry after a rejection never reuses the proxy
        that was just rejected.

        Returns:
            A requests-style proxies mapping, or None when the pool is empty.
        """
        if not self.proxy_pool:
            return None
        in_use = self.current_proxy.get("http") if self.current_proxy else None
        # Fall back to the full pool when it holds only the proxy in use.
        candidates = [p for p in self.proxy_pool if p != in_use] or self.proxy_pool
        proxy_url = random.choice(candidates)
        self.current_proxy = {"http": proxy_url, "https": proxy_url}
        return self.current_proxy

    def rotate_headers(self):
        """Pick a random header set and apply it to the session.

        Returns:
            The header dict that was applied.
        """
        headers = random.choice(self.headers_pool)
        self.session.headers.update(headers)
        return headers

    def _backoff(self, attempt, max_retries):
        """Sleep before the next attempt; do nothing after the last one."""
        if attempt < max_retries - 1:
            # Wait grows with the attempt number, plus jitter.
            wait = (attempt + 1) * 2 + random.uniform(0, 2)
            time.sleep(wait)

    def fetch(self, url, max_retries=3, timeout=10):
        """GET a URL, rotating proxy and headers on every attempt.

        Args:
            url: Address to request.
            max_retries: Maximum number of attempts.
            timeout: Per-request timeout in seconds.

        Returns:
            The response on success, or None when every attempt failed
            (network error, HTTP error status, 403 or 429).
        """
        for attempt in range(max_retries):
            try:
                self.rotate_headers()
                proxies = self.rotate_proxy()

                response = self.session.get(
                    url,
                    proxies=proxies,
                    timeout=timeout
                )

                # Blocked or throttled: back off before retrying instead of
                # hitting the server again immediately.
                if response.status_code == 429:
                    print("  被限速(429),换代理重试...")
                    self._backoff(attempt, max_retries)
                    continue
                if response.status_code == 403:
                    print("  被拒绝(403),换代理重试...")
                    self._backoff(attempt, max_retries)
                    continue

                response.raise_for_status()
                return response

            except requests.RequestException as e:
                print(f"  第{attempt+1}次失败: {e}")
                self._backoff(attempt, max_retries)

        return None

# Usage: fetch five list pages through the rotating-proxy crawler
crawler = ProxyCrawler(proxy_pool=PROXY_POOL)

for page in range(1, 6):
    url = f"https://example.com/list?page={page}"
    # end=" " keeps the result on the same line as the progress message
    print(f"抓取第{page}页...", end=" ")
    response = crawler.fetch(url)
    # fetch() returns None when all attempts failed
    if response:
        print(f"OK ({len(response.text)} bytes)")
    else:
        print("失败")
    # Random pause between pages
    time.sleep(random.uniform(1, 3))

四、请求频率控制

4.1 令牌桶算法

import time

class RateLimiter:
    """Token-bucket rate limiter.

    Tokens refill continuously at ``rate`` per second up to ``capacity``.
    Each acquire() consumes one token, blocking until one is available.
    No locking is done, so share an instance between threads only with
    external synchronization.
    """

    def __init__(self, rate=1.0, capacity=5):
        """
        Args:
            rate: Tokens generated per second (1.0 = one request per second).
            capacity: Bucket size, i.e. how many requests may burst at once.

        Raises:
            ValueError: If ``rate`` is not positive (the wait time would be
                undefined).
        """
        if rate <= 0:
            raise ValueError("rate must be positive")
        self.rate = rate
        self.capacity = capacity
        # The bucket starts full.
        self.tokens = capacity
        # Monotonic clock: immune to system clock adjustments.
        self.last_time = time.monotonic()

    def acquire(self):
        """Take one token, sleeping first if the bucket is empty."""
        now = time.monotonic()
        elapsed = now - self.last_time
        self.last_time = now

        # Refill for the time that passed since the last call.
        self.tokens = min(self.capacity, self.tokens + elapsed * self.rate)

        if self.tokens < 1:
            # Not enough tokens: wait until exactly one has accumulated.
            wait_time = (1 - self.tokens) / self.rate
            time.sleep(wait_time)
            # The token earned during the sleep is consumed right here, so
            # move the refill reference past the sleep. Leaving it at `now`
            # would credit the slept time again on the next call and let
            # roughly twice the configured rate through.
            self.last_time = now + wait_time
            self.tokens = 0
        else:
            self.tokens -= 1

# Usage: tokens refill at 1 per second; the bucket starts with 3 tokens,
# so the first requests can go out in a burst
limiter = RateLimiter(rate=1.0, capacity=3)

for i in range(10):
    # Blocks until a token is available
    limiter.acquire()
    print(f"请求 {i+1} @ {time.strftime('%H:%M:%S')}")

五、完整反反爬爬虫

把所有策略整合到一个类里:

import requests
import random
import time
from fake_useragent import UserAgent

class StealthCrawler:
    """Crawler combining UA rotation, proxies, random delays and rate limiting."""

    def __init__(self, proxy_pool=None, min_delay=1, max_delay=3, rate_limit=1.0):
        """Create the crawler.

        Args:
            proxy_pool: Optional list of proxy URLs; empty means no proxy.
            min_delay: Lower bound in seconds of the random pre-request delay.
            max_delay: Upper bound in seconds of the random pre-request delay.
            rate_limit: Maximum requests per second. Requests are spaced at
                least ``1 / rate_limit`` seconds apart. None or a
                non-positive value disables the limit.
        """
        self.session = requests.Session()
        self.ua = UserAgent()
        self.proxy_pool = proxy_pool or []
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.rate_limit = rate_limit
        # Number of successful requests so far.
        self.request_count = 0
        # Monotonic timestamp of the latest request, or None before the first.
        self._last_request_time = None

    def _build_headers(self, referer=None):
        """Build browser-like headers with a random User-Agent.

        Args:
            referer: Optional Referer value; omitted from the headers when
                falsy.
        """
        headers = {
            "User-Agent": self.ua.random,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            # "br" is intentionally not advertised: requests decodes Brotli
            # only when an extra Brotli package is installed, and otherwise
            # response.text would be undecoded compressed bytes.
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
        if referer:
            headers["Referer"] = referer
        return headers

    def _get_proxy(self):
        """Return a random proxies mapping, or None when the pool is empty."""
        if not self.proxy_pool:
            return None
        proxy_url = random.choice(self.proxy_pool)
        return {"http": proxy_url, "https": proxy_url}

    def _throttle(self):
        """Sleep as needed so requests stay within ``rate_limit`` per second."""
        if not self.rate_limit or self.rate_limit <= 0:
            return
        if self._last_request_time is not None:
            min_interval = 1.0 / self.rate_limit
            remaining = min_interval - (time.monotonic() - self._last_request_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    def fetch(self, url, referer=None, max_retries=3, timeout=15):
        """GET a URL with delays, header/proxy rotation and retries.

        Args:
            url: Address to request.
            referer: Optional Referer header value.
            max_retries: Maximum number of attempts.
            timeout: Per-request timeout in seconds.

        Returns:
            The response on success, or None when every attempt failed.
        """
        for attempt in range(max_retries):
            is_last = attempt == max_retries - 1
            try:
                # Random delay before every attempt
                delay = random.uniform(self.min_delay, self.max_delay)
                # Occasionally add a long pause (never before the first success)
                if self.request_count > 0 and random.random() < 0.1:
                    delay += random.uniform(5, 10)
                time.sleep(delay)
                # Enforce the configured request-rate ceiling
                self._throttle()

                headers = self._build_headers(referer)
                proxies = self._get_proxy()

                response = self.session.get(
                    url,
                    headers=headers,
                    proxies=proxies,
                    timeout=timeout
                )

                # Anti-crawling response: back off, unless nothing is left to retry
                if response.status_code in (403, 429):
                    if is_last:
                        print(f"  ⚠ 状态码{response.status_code},已达最大重试次数")
                        break
                    wait = (attempt + 1) * 5 + random.uniform(0, 3)
                    print(f"  ⚠ 状态码{response.status_code},等待{wait:.0f}s后重试")
                    time.sleep(wait)
                    continue

                response.raise_for_status()
                self.request_count += 1
                return response

            except requests.RequestException as e:
                print(f"  重试 {attempt+1}/{max_retries}: {e}")
                if not is_last:
                    time.sleep((attempt + 1) * 2)

        return None

# Usage example (no proxy_pool is passed, so requests are sent without a proxy)
crawler = StealthCrawler(
    min_delay=1,
    max_delay=3,
    rate_limit=1.0,
)

for page in range(1, 11):
    url = f"https://example.com/list?page={page}"
    # Use the previous list page as Referer; the first page has none
    referer = f"https://example.com/list?page={page-1}" if page > 1 else None
    response = crawler.fetch(url, referer=referer)
    # fetch() returns None when all attempts failed
    if response:
        print(f"第{page}页: OK ({len(response.text)} bytes)")
    else:
        print(f"第{page}页: 失败")

六、知识卡

策略 说明 难度
UA伪装 随机切换User-Agent
随机延迟 请求间隔随机化
Referer伪装 模拟从站内跳转
Cookie管理 Session保持登录态 ⭐⭐
IP代理轮换 每次请求换IP ⭐⭐
令牌桶限流 控制请求频率 ⭐⭐
请求头完整化 补全Accept/Language等
fake-useragent 自动获取真实UA

七、课后作业

必做题:

  1. 搭建一个UA池(至少10个不同的User-Agent)
  2. 实现随机延迟策略,让请求间隔在1-5秒之间随机
  3. 用httpbin.org测试你的请求头和代理是否生效

选做题:

  1. 实现一个完整的StealthCrawler类,包含UA轮换+代理+延迟
  2. 搭建一个免费代理IP池,自动检测可用性

完成作业的同学,把运行截图发到评论区!


反反爬的核心原则:让爬虫看起来像人。 UA随机、间隔随机、IP轮换、频率控制,缺一不可。

本篇要点:

  • UA伪装(手动池 + fake-useragent库)
  • 随机延迟(均匀分布 + 智能策略)
  • IP代理轮换(代理池 + 可用性检测)
  • 令牌桶限流
  • 完整StealthCrawler类

下一篇学习XPath精准定位——比CSS选择器更强大的数据提取方式。

收藏 + 关注,专栏更新不迷路!

有问题欢迎评论区留言,大家一起讨论!


标签:Python | 反爬虫 | IP代理 | User-Agent | 随机延迟 | 请求伪装 | 爬虫进阶

Logo

脑启社区是一个专注类脑智能领域的开发者社区。欢迎加入社区,共建类脑智能生态。社区为开发者提供了丰富的开源类脑工具软件、类脑算法模型及数据集、类脑知识库、类脑技术培训课程以及类脑应用案例等资源。

更多推荐