add code random spider
Some checks failed
Build and Upload / Explore-Gitea-Actions (push) Failing after 2m54s
This commit is contained in: parent 1191dc088b, commit 0bb79131f1
77 content/post/random-spider/index.md Normal file
@@ -0,0 +1,77 @@
---
title: "[Code] Random Spider"
description: A simple spider that crawls around the web at random
date: 2025-06-12
image: randomspider.png
keywords:
- spider
categories:
- code
tags:
- code
- spider
---

What the spider does:

1. Spoofs a Firefox browser User-Agent
2. Starts crawling from hao123
3. Extracts the external links on the current target and stores them in a deduplicated cache capped at 500 entries (illustrated below)
4. When a page is done, pops one address from the cache as the next target
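Step 3 is the heart of the loop. As a quick, self-contained illustration (the snippet and its sample HTML are mine, not part of the original script), BeautifulSoup's `find_all('a', href=True)` yields every anchor tag that carries an `href`, and a simple `startswith('http')` check keeps only absolute links:

```py
from bs4 import BeautifulSoup

# Toy page: one relative link, one absolute (external) link
html = '<a href="/local">home</a> <a href="https://example.com">out</a>'
soup = BeautifulSoup(html, 'html.parser')

external = [a['href'] for a in soup.find_all('a', href=True)
            if a['href'].startswith('http')]
print(external)  # ['https://example.com']
```

The full script: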
```py
import requests
from bs4 import BeautifulSoup
import time
import random

# Pretend to be a Firefox browser via the User-Agent header
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0'
}

# Deduplicated cache of URLs waiting to be visited
cache = set()

# Starting site
start_url = 'https://www.hao123.com'

# Retries per target
retry_times = 5

# Fetch one target and harvest its external links into the cache
def crawl(url):
    for i in range(retry_times):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                print(f'Fetched: {url}')
                # Parse the page and extract external links
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    # Keep only absolute links, and only while the cache holds fewer than 500
                    if href.startswith('http') and len(cache) < 500:
                        cache.add(href)
                return
            else:
                print(f'Failed: {url} (HTTP {response.status_code})')
                return
        except Exception as e:
            if i == retry_times - 1:
                print(f'Failed: {url} ({e})')
            time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before retrying

def main():
    cache.add(start_url)
    while cache:
        # set.pop() removes an arbitrary element, which is what makes the walk "random"
        target_url = cache.pop()
        crawl(target_url)
        time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before the next target

if __name__ == '__main__':
    main()
```
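One design note: the script's set simply stops accepting links once it holds 500, so the "size 500" cap really means "stop collecting at 500". A minimal sketch of a true bounded, deduplicating FIFO cache that evicts its oldest entry instead (the class and its names are hypothetical, not from the post):

```py
from collections import deque

class BoundedLinkCache:
    """Hypothetical alternative: holds at most `maxsize` unique URLs,
    evicting the oldest when full rather than rejecting new links."""

    def __init__(self, maxsize=500):
        self.maxsize = maxsize
        self._queue = deque()
        self._seen = set()

    def add(self, url):
        if url in self._seen:
            return  # deduplicate
        if len(self._queue) >= self.maxsize:
            oldest = self._queue.popleft()  # evict the oldest entry
            self._seen.discard(oldest)
        self._queue.append(url)
        self._seen.add(url)

    def pop(self):
        url = self._queue.popleft()  # raises IndexError when empty
        self._seen.discard(url)
        return url

    def __bool__(self):
        return bool(self._queue)
```

Swapping this in for the plain set would keep the frontier fresh on long runs, at the cost of occasionally revisiting a URL whose record was evicted.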
BIN content/post/random-spider/randomspider.png Normal file
Binary file not shown (782 KiB).