add code random spider

moonlightwatch 2025-06-12 16:10:38 +08:00
parent 1191dc088b
commit 0bb79131f1
2 changed files with 77 additions and 0 deletions

@@ -0,0 +1,77 @@
---
title: "[Code] Random Spider"
description: A simple spider that crawls around the web at random
date: 2025-06-12
image: randomspider.png
keywords:
- spider
categories:
- code
tags:
- code
- spider
---
What the spider does:
1. Spoofs a Firefox browser User-Agent
2. Starts crawling from hao123
3. Extracts the external links on the current target into a deduplicated cache of up to 500 entries
4. When finished, pops an address from the cache as the next target
```py
import requests
from bs4 import BeautifulSoup
import time
import random

# Use a Firefox browser User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0'
}

# Deduplicated cache of URLs to visit (capped at 500 entries)
cache = set()

# Starting site
start_url = 'https://www.hao123.com'

# Number of retries per request
retry_times = 5

# Crawl one target URL and collect its external links into the cache
def crawl(url):
    global cache
    for i in range(retry_times):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                print(f'Visit succeeded: {url}')
                # Parse the page and extract external links
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http') and len(cache) < 500:
                        cache.add(href)
                return
            else:
                print(f'Visit failed: {url}')
                return
        except Exception:
            if i == retry_times - 1:
                print(f'Visit failed: {url}')
            time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before retrying

# Main loop: pop a URL from the cache and crawl it
def main():
    global cache
    cache.add(start_url)
    while cache:
        target_url = cache.pop()
        crawl(target_url)
        time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before the next visit

if __name__ == '__main__':
    main()
```
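
One caveat: each target is popped out of `cache` when it is visited, so a page crawled later can re-add the same URL and the spider will visit it again. A minimal sketch of one way to keep true deduplication, assuming a hypothetical `visited` set that is not part of the original script:
```py
# Hypothetical variant (not in the original): remember visited URLs
# so a link that reappears later is never crawled twice.
visited = set()

def main():
    cache.add(start_url)
    while cache:
        target_url = cache.pop()
        if target_url in visited:
            continue  # already crawled, skip
        visited.add(target_url)
        crawl(target_url)
        time.sleep(random.uniform(1, 3))
```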

Binary file not shown (new image, 782 KiB).