add code random spider

moonlightwatch 2025-06-12 16:10:38 +08:00
parent 1191dc088b
commit 0bb79131f1
2 changed files with 77 additions and 0 deletions

@@ -0,0 +1,77 @@
---
title: "[Code] Random Spider"
description: A simple spider that crawls around the web at random
date: 2025-06-12
image: randomspider.png
keywords:
- spider
categories:
- code
tags:
- code
- spider
---
What the spider does:
1. Spoofs a Firefox browser User-Agent
2. Starts crawling from hao123
3. Extracts the external links on the current target into a deduplicated cache of up to 500 entries
4. When finished, pops an address from the cache as the next target
```py
import requests
from bs4 import BeautifulSoup
import time
import random

# Use a Firefox browser User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0'
}

# Deduplicated cache of URLs to visit (capped at 500 entries)
cache = set()

# Starting site
start_url = 'https://www.hao123.com'

# Number of retries per request
retry_times = 5

# Crawl one target URL and collect its external links into the cache
def crawl(url):
    global cache
    for i in range(retry_times):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                print(f'Visit succeeded: {url}')
                # Parse the page and extract external links
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.startswith('http') and len(cache) < 500:
                        cache.add(href)
                return
            else:
                print(f'Visit failed: {url}')
                return
        except Exception:
            if i == retry_times - 1:
                print(f'Visit failed: {url}')
            time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before retrying

# Main loop: pop a URL from the cache and crawl it
def main():
    global cache
    cache.add(start_url)
    while cache:
        target_url = cache.pop()
        crawl(target_url)
        time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before the next visit

if __name__ == '__main__':
    main()
```
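
One caveat: each target is popped out of `cache` when it is visited, so a page crawled later can re-add the same URL and the spider will visit it again. A minimal sketch of one way to keep true deduplication, assuming a hypothetical `visited` set that is not part of the original script:
```py
# Hypothetical variant (not in the original): remember visited URLs
# so a link that reappears later is never crawled twice.
visited = set()

def main():
    cache.add(start_url)
    while cache:
        target_url = cache.pop()
        if target_url in visited:
            continue  # already crawled, skip
        visited.add(target_url)
        crawl(target_url)
        time.sleep(random.uniform(1, 3))
```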

Binary file not shown (new image, 782 KiB).