---
title: "[Code] A Random Crawler"
description: A simple crawler that wanders the web at random
date: 2025-06-12
image: randomspider.png
keywords:
  - crawler
categories:
  - code
tags:
  - code
  - spider
---

What the crawler does:

1. Spoofs a Firefox browser User-Agent
2. Starts crawling from hao123
3. Extracts the external links on the current target and stores them in a deduplicated cache capped at 500 entries (see the note on external-link filtering after the code)
4. Once a page is done, pops an address from the cache as the next target

```py
import requests
from bs4 import BeautifulSoup
import time
import random

# Pretend to be Firefox via the User-Agent header
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0'
}

# Deduplicated cache of URLs waiting to be crawled (a set drops duplicates)
cache = set()

# Starting site
start_url = 'https://www.hao123.com'

# Retry attempts per URL
retry_times = 5

def crawl(url):
    """Fetch one page and harvest its absolute links into the cache."""
    for i in range(retry_times):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                print(f'Fetched: {url}')
                # Parse the page and extract links
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    # Keep absolute links only, capping the cache at 500 entries
                    if href.startswith('http') and len(cache) < 500:
                        cache.add(href)
                return
            else:
                print(f'Failed ({response.status_code}): {url}')
                return
        except requests.RequestException as e:
            if i == retry_times - 1:
                print(f'Giving up on {url}: {e}')
                return
            time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before retrying

def main():
    cache.add(start_url)
    while cache:
        target_url = cache.pop()  # set.pop() removes an arbitrary entry
        crawl(target_url)
        time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before the next target

if __name__ == '__main__':
    main()
```
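Feature 3 promises *external* links, but the loop above keeps any absolute URL, including links back to the same site. A minimal sketch of a stricter filter, comparing hostnames with the standard library's `urllib.parse` (the `is_external` helper is my addition, not part of the original crawler):

```py
from urllib.parse import urlparse

def is_external(href: str, current_url: str) -> bool:
    """Return True if href points to a different host than current_url."""
    return urlparse(href).netloc != urlparse(current_url).netloc
```

With this in place, the collection condition inside `crawl()` could become `href.startswith('http') and is_external(href, url) and len(cache) < 500`.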
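One more caveat on the "deduplicated" cache: the set only deduplicates URLs that are currently waiting. Once a URL has been popped and crawled, nothing stops a later page from re-adding it, so the walk can revisit pages. A sketch of one common fix, assuming a second `visited` set is acceptable memory-wise (the `next_target` helper is hypothetical):

```py
visited = set()  # URLs already crawled in this run

def next_target():
    """Pop cache entries until one we haven't visited yet turns up."""
    while cache:
        url = cache.pop()
        if url not in visited:
            visited.add(url)
            return url
    return None
```

`main()` would then loop on `next_target()` instead of `cache.pop()` and stop when it returns `None`.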