---
title: "[Code] Random Spider"
description: A simple spider that crawls around the web at random
date: 2025-06-12
image: randomspider.png
keywords:
- crawler
categories:
- code
tags:
- code
- spider
---

Spider features:

1. Spoofs a Firefox User-Agent
2. Starts crawling from hao123
3. Extracts the external links on the current target and stores them in a deduplicated cache capped at 500 entries
4. When the page is done, pops one address from the cache as the next target (see the note after this list)

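The "cache" here is just a Python `set`: adding the same URL twice stores it once, and `set.pop()` removes an arbitrary element rather than the oldest one, which is what makes the next target effectively random. A minimal sketch of that behavior (the URLs are placeholders):

```py
# A set deduplicates automatically and pops an arbitrary element.
cache = set()
cache.add('https://example.com/a')
cache.add('https://example.com/a')  # duplicate, silently ignored
cache.add('https://example.com/b')

print(len(cache))   # 2 -- the duplicate was only stored once
print(cache.pop())  # either URL; which one is not guaranteed
```

The full script:
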
```py
import requests
from bs4 import BeautifulSoup
import time
import random

# Pretend to be Firefox via the User-Agent header
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0'
}

# Deduplicated cache of URLs waiting to be crawled (capped at 500 entries)
cache = set()

# Starting site
start_url = 'https://www.hao123.com'

# Retries per URL
retry_times = 5

# Fetch one URL and collect its outbound links into the cache
def crawl(url):
    global cache
    for i in range(retry_times):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                print(f'Fetched: {url}')
                # Parse the page and extract outbound links
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http') and len(cache) < 500:
                        cache.add(href)
                return
            else:
                print(f'Failed: {url} (HTTP {response.status_code})')
                return
        except Exception as e:
            if i == retry_times - 1:
                print(f'Failed: {url} ({e})')
            time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before retrying

# Main loop: pop an arbitrary URL from the cache and crawl it
def main():
    global cache
    cache.add(start_url)
    while cache:
        target_url = cache.pop()
        crawl(target_url)
        time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before the next target

if __name__ == '__main__':
    main()
```
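
To try it, save the script (e.g. as `randomspider.py`; the filename is up to you) and run `python randomspider.py`. It keeps wandering until the cache empties or you stop it with Ctrl+C.

One caveat: the script treats every absolute `http(s)` URL as an "external link" and drops relative links entirely, so links back into the same site can also end up in the cache. If you only want links that actually leave the current host, one option is to resolve and compare hostnames with `urllib.parse`; a rough sketch under that assumption, not part of the original script:

```py
from urllib.parse import urljoin, urlparse

def external_links(base_url, soup):
    """Yield absolute links whose host differs from the current page's host."""
    base_host = urlparse(base_url).netloc
    for link in soup.find_all('a', href=True):
        href = urljoin(base_url, link['href'])  # resolve relative links too
        parsed = urlparse(href)
        if parsed.scheme in ('http', 'https') and parsed.netloc != base_host:
            yield href
```

Inside `crawl()`, the inner `for` loop could then iterate over `external_links(url, soup)` with the same 500-entry cap.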