Index of AI Application Development Articles
This column shares content related to AI application development, including but not limited to: implementation details of deploying AI algorithms, concepts and development techniques for AI back-end analysis services, concepts and development techniques for AI back-end application services, and implementation paths and development techniques for AI application front ends.
It is intended for readers with a basic grounding in algorithms and Python.
- An overview of the AI application development workflow
- Remote development with Visual Studio Code and the Remote Development extension
- Common issues with Git open-source projects and mirror-based workarounds
- UDP packet communication in Python
- Log generation and periodic cleanup in Python
- Common usage of the Linux terminal command Screen
- Storing data in Redis with Python
- Converting strings to dictionaries in Python
- Text vectorization and text similarity computation in Python
- Common MySQL data operations in Python
- A summary of handling anomalous data in Python, with examples
- General-purpose data collection with Selenium and bs4 (with code)
- Knowledge graph techniques with Python
- A clear roadmap for learning Python
- Common Linux, Git, and Docker commands
- Migrating Python environments between Linux and Windows
- Scheduled (auto) startup of Python services on Linux
- TTS development in Python on Windows
- Image segmentation with Python and OpenCV
- Translating Word documents in Python via an API
- yolo-world: a "large model" for object detection
- Advanced web scraping: multi-threaded crawlers
import asyncio
import json
import time
import random
from tqdm import tqdm
from fake_useragent import UserAgent
from curl_cffi.requests import AsyncSession

FILENAME = "data/data2.json"

# Set request headers (a random User-Agent is generated on each run)
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'close',
    'Upgrade-Insecure-Requests': '1',
}

async def fetch_data(session, page_id):
    url = f"https://ms.app.jiaodong.net/public/index.php/pc/v1/askInfo/{page_id}"
    retries = 3
    for _ in range(retries):
        try:
            response = await session.get(url, headers=headers, impersonate='chrome')
            if response.status_code == 200:
                return response.json()
            else:
                print(f"Error {response.status_code} for page_id {page_id}")
        except Exception as e:
            print(f"Exception for page_id {page_id}: {e}")
        await asyncio.sleep(random.uniform(1, 3))  # back off before retrying
    return None

def save_data(data):
    with open(FILENAME, 'a', encoding='utf-8') as f:
        for item in data:
            json.dump(item, f, ensure_ascii=False, indent=4)
            f.write('\n')

async def fetch_and_store_data(start_id, end_id, max_concurrent_tasks=10):
    all_data = []
    async with AsyncSession() as session:
        tasks = []
        # Limit the number of requests in flight at any one time
        semaphore = asyncio.Semaphore(max_concurrent_tasks)

        async def sem_fetch(page_id):
            async with semaphore:
                data = await fetch_data(session, page_id)
                return data

        for page_id in range(start_id, end_id + 1):
            tasks.append(sem_fetch(page_id))

        # Consume results as they complete instead of waiting for the whole batch
        for task in asyncio.as_completed(tasks):
            try:
                data = await task
                if data and data.get("code") == 200:  # skip failed fetches (fetch_data returns None)
                    all_data.append(data)
                    if len(all_data) >= 10:
                        save_data(all_data)
                        all_data = []
            except Exception as e:
                print(f"Exception occurred: {e}")
            # Random delay to reduce the risk of being banned
            delay = random.uniform(0.02, 0.1)
            await asyncio.sleep(delay)

    # Save the final batch of data
    if all_data:
        save_data(all_data)

if __name__ == "__main__":
    import sys
    # Use the selector event loop on Windows to avoid proactor-related warnings
    if sys.platform.startswith('win'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    end_id = 1242122 - 50
    start_id = 927384
    batch_size = 50
    # Work backwards from end_id to start_id in batches of batch_size ids
    for end_id in tqdm(range(end_id, start_id - 1, -batch_size),
                       desc=f"Fetching data(batch_size:{batch_size})",
                       total=round((end_id - start_id) / batch_size)):
        start_id = end_id - batch_size
        asyncio.run(fetch_and_store_data(start_id, end_id, max_concurrent_tasks=10))
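The heart of the script above is bounding the number of in-flight requests with asyncio.Semaphore and consuming results in completion order with asyncio.as_completed. Below is a minimal, self-contained sketch of just that pattern; fake_fetch is a stand-in for a real request and is not part of the original code.

import asyncio
import random

async def fake_fetch(page_id):
    # Stand-in for a network request: sleep for a short random time
    await asyncio.sleep(random.uniform(0.05, 0.2))
    return {"code": 200, "page_id": page_id}

async def main(max_concurrent_tasks=10):
    semaphore = asyncio.Semaphore(max_concurrent_tasks)

    async def sem_fetch(page_id):
        # At most max_concurrent_tasks coroutines run this body at once
        async with semaphore:
            return await fake_fetch(page_id)

    tasks = [sem_fetch(i) for i in range(50)]
    # as_completed yields results in the order they finish, not the order created
    for task in asyncio.as_completed(tasks):
        result = await task
        print(result["page_id"], result["code"])

if __name__ == "__main__":
    asyncio.run(main())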
Supplementary example
import requests
from bs4 import BeautifulSoup
import json
import re
import multiprocessing
import xlwt
import time

def netease_spider(headers, news_class, i):
    # The first page of each channel uses a slightly different URL pattern
    if i == 1:
        url = "https://temp.163.com/special/00804KVA/cm_{0}.js?callback=data_callback".format(news_class)
    else:
        url = 'https://temp.163.com/special/00804KVA/cm_{0}_0{1}.js?callback=data_callback'.format(news_class, str(i))
    pages = []
    try:
        response = requests.get(url, headers=headers).text
    except:
        print("Failed to fetch the current index page")
        return
    # The response is JSONP; strip the callback wrapper to recover the JSON array
    start = response.index('[')
    end = response.index('])') + 1
    data = json.loads(response[start:end])
    try:
        for item in data:
            title = item['title']
            docurl = item['docurl']
            label = item['label']
            source = item['source']
            doc = requests.get(docurl, headers=headers).text
            soup = BeautifulSoup(doc, 'lxml')
            news = soup.find_all('div', class_='post_body')[0].text
            news = re.sub(r'\s+', '', news).strip()
            pages.append([title, label, source, news])
            time.sleep(3)  # be polite to the server between detail pages
    except:
        print("Failed to fetch the current detail page")
        pass
    return pages

def run(news_class, nums):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'}
    tmp_result = []
    for i in range(1, nums + 1):
        tmp = netease_spider(headers, news_class, i)
        if tmp:
            tmp_result.append(tmp)
    return tmp_result

if __name__ == '__main__':
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('NetEase news data')
    sheet.write(0, 0, 'Title')
    sheet.write(0, 1, 'Label')
    sheet.write(0, 2, 'Source')
    sheet.write(0, 3, 'Content')

    news_classes = {'guonei', 'guoji'}
    nums = 3
    index = 1
    pool = multiprocessing.Pool(30)
    for news_class in news_classes:
        # Note: calling result.get() inside the loop blocks until this channel
        # finishes, so the channels are effectively processed one after another
        result = pool.apply_async(run, (news_class, nums))
        for pages in result.get():
            for page in pages:
                if page:
                    title, label, source, news = page
                    sheet.write(index, 0, title)
                    sheet.write(index, 1, label)
                    sheet.write(index, 2, source)
                    sheet.write(index, 3, news)
                    index += 1
    pool.close()
    pool.join()
    print("Scraped {0} news articles in total".format(index - 1))
    book.save("netease_news_results.xls")