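# Below is an example crawler that scrapes data from NetEase Military (网易军事, war.163.com).
# Gallery IDs 2293111-2314544, 21434 entries in total.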
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
# Crawl configuration.
# NOTE(review): the 3000 offset (and the "3000后" in the output file name)
# presumably means an earlier run already covered the first 3000 gallery IDs
# -- confirm before changing.
ID_OFFSET = 3000
FIRST_GALLERY_ID = 2293111 + ID_OFFSET
LAST_GALLERY_ID = 2314545  # exclusive upper bound passed to range()
OUTPUT_PATH = "./从网易军事爬取3000后.json"
SAVE_EVERY = 10  # rewrite the output file after every 10 collected records
REQUEST_TIMEOUT = 10  # seconds; prevents one stalled request from hanging the crawl


def extract_source(content):
    """Return ``info.source`` from the gallery-data JSON text.

    Args:
        content: Raw JSON text taken from the page's ``gallery-data`` textarea.

    Returns:
        The value stored under ``data["info"]["source"]``, or ``None`` when
        the text is not valid JSON or does not have that structure.
    """
    try:
        data = json.loads(content)
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        return None
    try:
        return data["info"]["source"]
    except (KeyError, TypeError):
        # Valid JSON, but not the {"info": {"source": ...}} shape we need.
        return None


def parse_gallery_page(html):
    """Extract the description and source from one gallery page.

    Args:
        html: HTML text of the page.

    Returns:
        A ``(description, source)`` tuple, or ``None`` when the page lacks
        the ``<meta name="description">`` tag, the ``gallery-data`` textarea,
        or a usable value in either of them.
    """
    soup = BeautifulSoup(html, "html.parser")
    meta_tag = soup.find("meta", {"name": "description"})
    textarea_tag = soup.find("textarea", {"name": "gallery-data"})
    if meta_tag is None or textarea_tag is None:
        # Skip the page instead of reusing values left over from an earlier one.
        return None

    description = meta_tag.get("content")
    source = extract_source(textarea_tag.text.strip())
    if description is None or source is None:
        return None
    return description, source


def save_records(records, path=OUTPUT_PATH):
    """Write *records* to *path* as pretty-printed UTF-8 JSON, replacing the file."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)


def crawl():
    """Fetch every gallery page in the configured ID range and collect records.

    Each record is ``{"id": ..., "text": description, "source": source}``,
    with ids numbered consecutively starting at ``ID_OFFSET + 1``. Pages that
    cannot be fetched or parsed are skipped and do not consume an id. The
    output file is rewritten every ``SAVE_EVERY`` records and once more at
    the end so trailing records are not lost.

    Returns:
        The list of collected records.
    """
    records = []
    record_id = ID_OFFSET
    for gallery_id in tqdm(range(FIRST_GALLERY_ID, LAST_GALLERY_ID)):
        url = f"https://war.163.com/photoview/4T8E0001/{gallery_id}.html#p=DNFMCASE4T8E0001NOS"
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable page should not abort a crawl over thousands of pages.
            tqdm.write(f"skipping {gallery_id}: {exc}")
            continue

        parsed = parse_gallery_page(response.text)
        if parsed is None:
            continue
        description, source = parsed

        record_id += 1
        records.append({
            "id": record_id,
            "text": description,
            "source": source,
        })
        if record_id % SAVE_EVERY == 0:
            save_records(records)

    # Flush the records collected since the last periodic save.
    if records and record_id % SAVE_EVERY != 0:
        save_records(records)
    return records


if __name__ == "__main__":
    new_data = crawl()