继上一篇亚马逊bestsellers的爬取优化。主要是开启多线程以及自动重新爬取失败的链接。
但还是有问题,就是如果那个链接本身就有问题,那爬虫就会一直爬下去,得手动停止。但问题不大,只要确保爬取的链接正确,就行。emmm....不想改了
bestsellers不用selenium,可以稍微控制下速度,有问题欢迎各位同学指教哈
- 生产者线程主要是去爬bestsellers链接
- item消费者解析数据并保存
- 图片线程下载图片
有个问题:保存item数据的时候,本来是想用生成器返回数据,再另建一个函数来保存的。
可是线程里面run()函数是start()函数调用的,start()函数无返回值,重写start()函数又怕出什么错,有朋友知道怎么解决吗
import csv
import os
import queue
import threading

import pandas as pd
import requests
from bs4 import BeautifulSoup
class ProducerAmazon(threading.Thread):
    """Producer thread: download bestseller category pages and queue their items.

    Every entry taken from ``url_queue`` is a dict ``{category: url}``.  The
    page is fetched with ``requests`` and parsed with BeautifulSoup; the nodes
    matching ``ITEM_SELECTOR`` are put on ``items_queue`` as
    ``{category: items}``.  A page that cannot be fetched, or that yields no
    items, is put back on ``url_queue`` so that it is tried again.

    The thread finishes as soon as it finds ``url_queue`` empty.
    """

    # CSS selector matching one ranked product entry on a bestsellers page.
    ITEM_SELECTOR = "#zg-ordered-list > li.zg-item-immersion"

    def __init__(self, url_queue, items_queue, headers, *args,
                 timeout=30, max_retries=None, **kwargs):
        """Create the producer.

        Args:
            url_queue: queue of ``{category: url}`` dicts still to be crawled.
            items_queue: queue receiving ``{category: items}`` dicts.
            headers: HTTP headers sent with every request.
            timeout: seconds ``requests.get`` may wait before giving up, so a
                stalled connection cannot hang the thread forever.
            max_retries: how many times this thread re-queues the same URL
                after a failure.  ``None`` (the default) retries without
                limit.  Failures are counted per thread, not globally, so
                with several producers a URL may be tried more often.
            *args, **kwargs: forwarded to ``threading.Thread`` (e.g. ``name``).
        """
        super(ProducerAmazon, self).__init__(*args, **kwargs)
        self.headers = headers
        self.url_queue = url_queue
        self.items_queue = items_queue
        self.timeout = timeout
        self.max_retries = max_retries
        # url -> number of failed attempts seen by this thread.
        self._failures = {}

    def run(self):
        """Crawl URLs until the URL queue is empty."""
        while True:
            # get_nowait() instead of "if not empty(): get()": with several
            # producers another thread can take the last URL between the
            # check and the get, which would block this thread forever.
            try:
                url_obj = self.url_queue.get_nowait()
            except queue.Empty:
                break
            for category, url in url_obj.items():
                self._crawl(category, url)
        print("%s线程执行完成" % threading.current_thread().name)

    def _crawl(self, category, url):
        """Fetch one category page and queue its items; re-queue it on failure."""
        try:
            r = requests.get(url, headers=self.headers, timeout=self.timeout)
            r.raise_for_status()
            # r.encoding is derived from the response headers and
            # r.apparent_encoding from the content; either would do here.
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'lxml')
            items = soup.select(self.ITEM_SELECTOR)
        except Exception as e:
            # Best effort: any failure (network, HTTP status, parsing) sends
            # the URL back to the queue to be crawled again.
            print("%s爬取失败" % url, e)
            self._requeue(category, url)
            return
        if not items:
            # Nothing could be extracted from the page: crawl it again.
            print(r.status_code, items)
            self._requeue(category, url)
        else:
            self.items_queue.put({category: items})
            print("成功")

    def _requeue(self, category, url):
        """Put a failed URL back on the queue unless its retry budget is spent."""
        failures = self._failures.get(url, 0) + 1
        self._failures[url] = failures
        if self.max_retries is not None and failures > self.max_retries:
            print("%s超过最大重试次数(%d),已放弃" % (url, self.max_retries))
            return
        self.url_queue.put({category: url})
class ConsumerItem(threading.Thread):
    """Consumer thread: turn queued bestseller items into CSV rows.

    Every entry taken from ``items_queue`` is a dict ``{category: items}``
    whose items are parsed HTML nodes (they must offer ``select``).  One row
    per item is appended to ``CSV_PATH``, and ``{asin: image_url}`` is put on
    ``image_queue`` for each item that has both values.

    The thread finishes once no new batch arrives within ``idle_timeout``
    seconds.
    """

    # Output file, appended to (relative to the current working directory).
    CSV_PATH = 'asin.csv'

    # Column order of the CSV.  'catelogy' (sic) is the header the output has
    # always used, so it is kept for compatibility with existing files.
    COLUMNS = (
        'catelogy', "asin", "item_img", "item_rank", "item_title",
        "item_url", "item_star", "item_number", "item_review_url", "price",
    )

    # The CSS class of the title varies; these two have been seen so far.
    TITLE_SELECTORS = ("div.p13n-sc-truncate", "div.p13n-sc-truncated")

    def __init__(self, items_queue, image_queue, *args, idle_timeout=20, **kwargs):
        """Create the consumer.

        Args:
            items_queue: queue of ``{category: items}`` dicts to process.
            image_queue: queue receiving ``{asin: image_url}`` dicts.
            idle_timeout: seconds to wait for a new batch before exiting.
            *args, **kwargs: forwarded to ``threading.Thread`` (e.g. ``name``).
        """
        super(ConsumerItem, self).__init__(*args, **kwargs)
        self.items_queue = items_queue
        self.image_queue = image_queue
        self.idle_timeout = idle_timeout

    def run(self):
        """Process batches until the items queue stays empty for ``idle_timeout``."""
        while True:
            try:
                item_obj = self.items_queue.get(timeout=self.idle_timeout)
            except queue.Empty:
                # No more work arrived in time: end the thread cleanly
                # instead of dying with an unhandled exception.
                break
            rows = []
            for category, items in item_obj.items():
                for item in items:
                    row = self._parse_item(category, item)
                    rows.append(row)
                    asin, item_img = row["asin"], row["item_img"]
                    # Without an asin (the file name) or an image URL the
                    # download cannot succeed, so nothing is queued.
                    if asin is not None and item_img is not None:
                        self.image_queue.put({asin: item_img})
            self._append_rows(rows)

    @staticmethod
    def _first(item, selector):
        """Return the first node matching ``selector`` inside ``item``, or None."""
        matches = item.select(selector)
        return matches[0] if matches else None

    @staticmethod
    def _text(node):
        """Return ``node.text``, or None when the node is missing."""
        return node.text if node is not None else None

    def _parse_item(self, category, item):
        """Extract the CSV fields of one product node into a dict keyed by COLUMNS."""
        link = self._first(item, "a.a-link-normal")
        try:
            # The asin is the fourth "/"-separated part of the product link.
            asin = item.select("a.a-link-normal")[0].get("href").split("/")[3]
        except (IndexError, AttributeError) as e:
            # No link, no href, or a link with an unexpected shape.
            asin = None
            print(e)

        title = None
        for selector in self.TITLE_SELECTORS:
            title_node = self._first(item, selector)
            if title_node is not None:
                title = title_node.text.strip()
                break
        else:
            print("没有title")

        image = self._first(item, "div.a-section > img")
        review = self._first(item, "a.a-size-small")
        review_href = review.get("href") if review is not None else None

        return {
            'catelogy': category,
            "asin": asin,
            "item_img": image.get("src") if image is not None else None,
            "item_rank": self._text(self._first(item, "span.zg-badge-text")),
            "item_title": title,
            "item_url": link.get("href") if link is not None else None,
            "item_star": self._text(self._first(item, "span.a-icon-alt")),
            "item_number": self._text(review),
            "item_review_url": ("https://www.amazon.com" + review_href
                                if review_href is not None else None),
            "price": self._text(self._first(item, "span.a-size-base")),
        }

    def _append_rows(self, rows):
        """Append one batch of rows to the CSV, writing the header only once."""
        if not rows:
            return
        # One DataFrame per batch instead of one pd.concat per row.  The
        # leading index column is kept (same file layout); it now holds the
        # row's position within its batch.
        frame = pd.DataFrame(rows, columns=list(self.COLUMNS))
        is_new_file = (not os.path.exists(self.CSV_PATH)
                       or os.path.getsize(self.CSV_PATH) == 0)
        frame.to_csv(self.CSV_PATH, mode='a', header=is_new_file)
class ConsumerImage(threading.Thread):
    """Consumer thread: download product images named after their asin.

    Every entry taken from ``image_queue`` is a dict ``{asin: image_url}``.
    The image is downloaded with ``requests`` and written to
    ``<save_dir>/<asin>.jpg``.  A failed download is reported and skipped.

    The thread finishes once no new entry arrives within ``idle_timeout``
    seconds.
    """

    def __init__(self, image_queue, headers, *args,
                 save_dir=r"D:\python\asin_img", idle_timeout=20, timeout=30,
                 **kwargs):
        """Create the consumer.

        Args:
            image_queue: queue of ``{asin: image_url}`` dicts to download.
            headers: HTTP headers sent with every request.
            save_dir: directory the ``.jpg`` files are written to; it is
                created when missing.
            idle_timeout: seconds to wait for a new entry before exiting.
            timeout: seconds ``requests.get`` may wait before giving up.
            *args, **kwargs: forwarded to ``threading.Thread`` (e.g. ``name``).
        """
        super(ConsumerImage, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.headers = headers
        self.save_dir = save_dir
        self.idle_timeout = idle_timeout
        self.timeout = timeout

    def run(self):
        """Download images until the queue stays empty for ``idle_timeout``."""
        try:
            os.makedirs(self.save_dir, exist_ok=True)
        except OSError as e:
            # Keep going: each failed write below is reported per image.
            print(self.save_dir, e)
        while True:
            try:
                img_obj = self.image_queue.get(timeout=self.idle_timeout)
            except queue.Empty:
                # No more work arrived in time: leave the loop so the
                # completion message below is actually reached.
                break
            for asin, item_img in img_obj.items():
                try:
                    img = requests.get(item_img, headers=self.headers,
                                       timeout=self.timeout)
                    # Do not save an HTTP error page as if it were an image.
                    img.raise_for_status()
                    path = os.path.join(self.save_dir, "{}.jpg".format(asin))
                    with open(path, "wb") as fp:
                        fp.write(img.content)
                except Exception as e:
                    # Best effort: report the failure and move on.
                    print(asin, e)
        print("%s线程执行完成" % threading.current_thread().name)
def main(urls_path=r'D:\类目链接.xlsx'):
    """Load the category URLs and start the crawler threads.

    Reads the Excel sheet at ``urls_path`` (columns '爬取链接' = URL and
    '目录' = category), queues one ``{category: url}`` dict per row, then
    starts 4 page producers, 1 item consumer and 2 image consumers.  The
    function returns once the threads are started; they keep the process
    alive until they finish.

    Args:
        urls_path: path of the Excel file listing the categories to crawl.
    """
    # Unbounded on purpose: the queue is filled completely before any
    # producer runs, so a size limit would make put() block forever as soon
    # as the sheet holds more rows than the limit.
    url_queue = queue.Queue()
    items_queue = queue.Queue(100000)
    image_queue = queue.Queue(100000)
    headers = {'user-agent': 'Mozilla/5.0'}

    # Load the category links to crawl.  This must happen before the
    # producers start, because a producer exits when it finds the queue empty.
    urls_df = pd.read_excel(urls_path)
    for url, category in zip(urls_df['爬取链接'], urls_df['目录']):
        url_queue.put({category: url})

    workers = []
    for n in range(4):
        workers.append(
            ProducerAmazon(url_queue, items_queue, headers, name="生产者线程%d" % n))
    for n in range(1):
        workers.append(ConsumerItem(items_queue, image_queue))
    for n in range(2):
        workers.append(
            ConsumerImage(image_queue, headers, name="图片消费者线程%d" % n))
    for th in workers:
        th.start()
# Script entry point: start the crawler only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()