1.寻找数据
点击翻页后,在检查页面可以看到
请求采用 POST 方法，数据以 JSON 格式返回。
每一页的数据是通过表单中的 current 字段来传参的。
2. 爬取代码
import csv
from concurrent.futures import ThreadPoolExecutor
from queue import Empty, Queue

import requests
# Endpoint serving Xinfadi wholesale price data; pagination is driven by
# the POST form field `current` (page number).
url = "http://www.xinfadi.com.cn/getPriceData.html"
# Browser-like User-Agent so the server does not reject the request as a bot.
headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/105.0.0.0 Safari/537.36'
}
# Producer pool: 5 workers fetching pages concurrently.
threadPool_make = ThreadPoolExecutor(5)
# Consumer pool (only one consumer task is ever submitted below).
threadPool_sel = ThreadPoolExecutor(10)
# Bounded queue handing scraped rows from producers to the consumer.
queue = Queue(100)
# Producer
def get_url(_data):
    """Fetch one page of price data and push each row onto the queue.

    `_data` is the POST form payload ({'limit': ..., 'current': page}).
    Errors are caught and logged explicitly: an exception raised inside a
    ThreadPoolExecutor task is otherwise swallowed silently unless the
    Future's result() is inspected, which this script never does.
    """
    try:
        # A timeout prevents a hung connection from blocking a worker forever.
        response = requests.post(url, headers=headers, data=_data, timeout=10)
        response.raise_for_status()
        rows = response.json()['list']
    except Exception as e:
        print(f"ERROR: 请求失败 {_data}: {e}")
        return
    for i in rows:
        msg = [i['prodCat'], i['prodName'], f"{i['lowPrice']}-{i['highPrice']}", i['avgPrice'], i['specInfo'],
               i['pubDate']]
        queue.put(msg)
# Fan the 100 page requests out across the producer pool, one form
# payload per page (the site's pages are 1-indexed).
for page in range(1, 101):
    payload = {'limit': '20', 'current': f'{page}'}
    threadPool_make.submit(get_url, payload)
# Consumer side: open the CSV sink and write the header row.
# newline='' is required by the csv module docs — without it the writer
# emits an extra blank line between rows on Windows. Mode 'w' replaces
# the original 'w+': the file is only ever written, never read back.
file = open('data.csv', 'w', encoding='utf-8', newline='')
csv_hand = csv.writer(file)
csv_hand.writerow(['类型', '名称', '价格范围', '平均价格', '规格', '日期'])
def save_csv():
    """Consumer: drain rows from the queue into the CSV file.

    Terminates (and closes the file) once no row has arrived for 3
    seconds, which is taken to mean all producers have finished.

    Only `queue.Empty` is treated as the termination signal; the
    original code caught bare `Exception`, so a genuine write error
    would have been misreported as "数据写入完成".
    """
    while True:
        try:
            msg = queue.get(timeout=3)
        except Empty:
            # No data for 3s -> producers are done; flush and stop.
            print("INFO:" + "数据写入完成")
            file.close()
            break
        if not msg:
            continue
        csv_hand.writerow(msg)
        print(f"INFO:{msg}写入完成。。。 队列剩下数据{queue.qsize()}个")
# Submit the single consumer task to the consumer pool.
threadPool_sel.submit(save_csv)
测试结果: