学习爬虫也有段时间了。这次把一个月前写的爬虫更新了一下,加入了多线程,其它复杂的地方也做了简化,思路更清晰一些了。
首先导入一些库
# Standard library
import re
from os import path, mkdir
from queue import Queue, Empty
from random import choice
from threading import Thread
from time import time, sleep
from uuid import uuid4

# Third-party
import requests
from bs4 import BeautifulSoup
from lxml.html import fromstring
接着获取用户输入的网页 url,做些简单的判断,再将 url 列表传入 main 函数中。
if __name__ == '__main__':
    # Default Referer used when a page URL does not carry its own.
    dreferer = 'https://www.taobao.com/?spm=a230r.7195193.1581860521.1.566dfeb6APAvF9'
    thread_max = 10  # number of download worker threads
    q = Queue()      # shared task queue of Picture objects
    while True:
        urls = input('请输入待爬取网址<url之间用逗号隔开>:')
        if urls == '0':
            # "0" quits the program.
            break
        elif 'set' in urls:
            # "set N" changes the worker-thread count.
            try:
                # BUG FIX: int() instead of eval() — never evaluate raw
                # user input as Python code.
                thread_max = int(urls[urls.find('t') + 1:])
            except ValueError:
                print('输入错误请重试')
            # BUG FIX: always go back to the prompt here; previously the
            # raw "set N" string fell through into main().
            continue
        elif 'all' in urls:
            # "all:<shop-list-url>,<referer>" crawls every item of a shop.
            # Normalize full-width punctuation the user may have typed.
            urls = urls.replace(':', ':').replace(',', ',')
            urls = getallgoods(*urls[urls.find(':') + 1:].strip().split(','))
        else:
            # Plain comma-separated list of item-page URLs.
            try:
                urls = [PageUrl(u, dreferer)
                        for u in urls.replace(',', ',').split(',')]
            except Exception:
                print('输入有误请重试')
                continue
        main(urls)
在 main 函数中先将商品页面的主要信息获取下来,把图片的 url 装填到一个队列中。由于淘宝的反爬,我构造了 PageUrl 对象,具有 url 和 ref(referer)属性;最后用多线程来处理队列中的任务。
def main(urls):
    """Scrape each product page and download its media with worker threads.

    For every PageUrl in *urls*: fetch the page, create an output directory
    tree named after the product title, save the textual attribute list, and
    enqueue Picture tasks (thumbnails, colour swatches, description images,
    video) onto the global queue ``q``.  Finally ``thread_max`` MyThread
    workers drain the queue.

    urls -- iterable of PageUrl objects (each has .url and .ref).
    BUG FIX: the parameter was named ``url`` but the body iterated the
    undefined name ``urls`` — it only worked via an accidental global.
    """
    global q
    for url in urls:
        sleep(1)  # throttle page requests a little
        page = gethtml(url.url, ref=url.ref)
        if page is None:
            # Download failed (gethtml returns None) -> skip this product.
            continue
        html = page.text
        soup = BeautifulSoup(html, 'html.parser')
        # 商品标题 — also used as the output directory name.
        try:
            title = soup.find('h3', class_="tb-main-title").attrs['data-title']
        except AttributeError:
            # Unexpected page layout (e.g. anti-bot page) -> skip it.
            continue
        # A time() suffix keeps directory names unique across runs.
        root = './%s' % title + str(time())[-4:]
        # Create the whole output tree (parents listed before children).
        for sub in ('', '/图片', '/图片/首图', '/图片/颜色',
                    '/图片/详情图', '/详情'):
            if not path.exists(root + sub):
                mkdir(root + sub)
        # 获取详情 — textual attribute list of the product.
        try:
            detail = soup.find('ul', class_='attributes-list').text
            with open(root + '/详情/详情.txt', 'w', encoding='utf-8') as f:
                f.write(detail)
            print('详情解析成功!')
        except Exception:
            print('详情解析失败!')
        # 获取首图 — thumbnail gallery; bump 50x50 thumbs to 400x400.
        try:
            jpgs = soup.find('ul', id="J_UlThumb", class_="tb-thumb tb-clearfix")
            for each in jpgs('img'):
                src = each.attrs['data-src'].replace('50x50', '400x400')
                url_jpg = src if 'http' in src else 'http:' + src
                q.put(Picture(url_jpg, '首图',
                              root + '/图片/首图/' + str(uuid4()) + url_jpg[-4:]))
            print('主图解析成功!')
        except Exception:
            print('主图解析失败')
        # 获取颜色 — swatch images are embedded in inline style attributes.
        color_list = []
        try:
            colors = soup.find('ul', class_="J_TSaleProp tb-img tb-clearfix")
            for each in colors('a'):
                try:
                    style = (each.attrs['style']
                             .replace('jpg_30x30', 'jpg_400x400')
                             .replace('background:url(', '')
                             .replace(') center no-repeat;', ''))
                    color_list.append(style if 'http' in style
                                      else 'http:' + style)
                except Exception:
                    pass  # anchors without a style attribute are skipped
            print('颜色展示解析成功!')
        except Exception:
            print('颜色展示解析失败!')
        for u in color_list:
            q.put(Picture(u, '颜色',
                          root + '/图片/颜色/' + str(uuid4()) + u[-4:]))
        # 获取描述图片 — description images live in a separate JS resource.
        try:
            find_desc = re.compile(r'descUrl[ ]*:[^:]*:[^:]*:[ ]*\'([^\']*)\',')
            # BUG FIX: re.M was previously passed as search()'s *pos*
            # argument (skipping the first 8 characters); flags belong in
            # re.compile, and this pattern needs none.
            desc_js = 'https:' + find_desc.search(html).group(1)
            desc_html = gethtml(desc_js, ref=url.url)
            find_dp = re.compile('src=\"(.*?\.jpg)\"')
            for u in find_dp.findall(desc_html.text):
                q.put(Picture(u, '详情图',
                              root + '/图片/详情图/' + str(uuid4()) + u[-4:]))
            print('描述图获取成功')
        except Exception:
            print('描述图获取失败')
        # 获取视频 — rebuild the CDN video URL from the owner/video ids.
        try:
            find_videoid = re.compile(r'"videoId":"\d{5,20}"')
            videoid = find_videoid.search(html).group(0)[11:-1]
            find_ownerid = re.compile(r'"videoOwnerId":"\d{5,20}"')
            ownerid = find_ownerid.search(html).group(0)[16:-1]
            videolink = 'https://cloud.video.taobao.com/play/u/%s/p/1/e/6/t/1/%s.mp4' % (ownerid, videoid)
            q.put(Picture(videolink, '视频',
                          root + '/图片/首图/主图视频' + videolink[-4:]))
            print('视频解析成功!')
        except Exception:
            # BUG FIX: the failure branch used to print "成功" as well.
            print('视频解析失败!')
    print('资源队列填充完毕!')
    # Drain the queue with thread_max worker threads.
    threads = []
    for t in range(thread_max):
        t = MyThread('{}号线程'.format(t + 1), threadfunc)
        threads.append(t)
        t.start()
    print('线程数{}'.format(thread_max))
    for t in threads:
        t.join()
        print('{}退出!'.format(t.name))
main中引用的函数如下
def gethtml(url, ref=None):
    """GET *url* with browser-like headers and a random User-Agent.

    ref -- value for the Referer header; falls back to the global
           ``dreferer`` when None.
    Returns the requests.Response on success, or None on any failure —
    callers MUST check for None before using the result.

    NOTE(review): the hard-coded cookie will expire; refresh it from a
    logged-in browser session when downloads start failing.
    """
    print('Download', url)
    user_agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
                   'Mozilla/5.0',
                   'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
                   ]
    kv = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'pragma': 'no-cache',
        'cookie':'lLtC1_=1; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zMG1aAN%2F0SGVHunXe1zncjzZgd4rdD%2FjCPEtps5h1DdNN9ZtGY%2FSXjB6pLKY7JP0duRYcn4QfhdfkaO73LC%2FZ%2BSm52%2BHihM%2BJ4oPPcIVRYd0R1ub7uC%2BaXeI34SoaPZqOZva2hBGHPQL%2BqYrEPq8PfThv54jYWXg%2BrMf165zLHvHzJbKtyKB8W%2BWMcSM8T2W4amIwMx7aaqh6zn90EuKolMosljXNUYIzp9k489WSmONfQRl2c4pI2hCLL5x87g1Sb2V75hIfp%2FyCFn1yQSLjFKglhwAJsc7ZDJO9mGKLf685wwmJtZqnd4kkSRMCaDmauW6eGMo3B2mKZ; _samesite_flag_=true; cookie2=12e97f3971fbc06584c7bdd035a19bc2; t=6b54384541f545975c14314b6ce2c499; _tb_token_=ee71e6f7e1eb5; enc=ICQVf4fgltTYaOxxdyGVpl2Brpo%2BqgAIdqDRdCux01c2Nrqk3%2FhIOl6aUfkKKWz%2FY%2Fiu%2FP9B%2F9l7xX9MK82npg%3D%3D; _m_h5_tk=503a8421e104d76703098a189a305df6_1591066051640; _m_h5_tk_enc=b45c31ef6c8fe607dc0e9462967140bd; tfstk=cQklBAGNZbP5XWPm1LwSxDLp3YAOZEszP5Fq3YgW79a-aSMViZ1V_lLVPruop61..; sgcookie=E6PHmdt%2FPsAEYwnepVjhZ; mt=ci=0_0; tracknick=; cna=2G1PFxSAs0ACAXufbrvks+6p; v=0; isg=BJCQS95b-sb2Y6bCmFcDX5hVYd7iWXSjZsQe0Iph0-uhxTBvMmjWM7n8nY0lFSx7; l=eBN5X7qRQ0UfCVe0BO5ZPurza77OiIRb4sPzaNbMiInca1rV1UjRnNQDFuODRdtjgtCe5etPOzL1BRLHR3Ap9xDDBYFinQFE3xvO.',
        'referer': ref or dreferer,
        'sec-fetch-dest': 'script',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-site',
        'upgrade-insecure-requests': '1',
        'User-Agent': choice(user_agents)
    }
    try:
        r = requests.get(url, headers=kv, timeout=4)
        # Check the status before trusting the body; apparent_encoding is
        # only worth computing for successful responses.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception:
        # Narrowed from a bare except: still covers every requests error
        # without swallowing KeyboardInterrupt/SystemExit.
        print('Download error')
        r = None
    return r
class MyThread(Thread):
    """A worker thread that simply runs the callable it was given.

    The name is shown in the exit messages printed by main().
    """

    def __init__(self, name, func):
        super().__init__()
        # Store the display name and the zero-argument thread body.
        self.name, self.func = name, func

    def run(self):
        # Delegate the whole thread body to the injected callable.
        self.func()
class Picture():
    """One download task for the worker threads.

    Bundles a remote resource URL, its category label (used only for
    progress messages), and the local file path to save it under.
    """

    def __init__(self, url, attr, pos):
        self.url = url    # remote resource URL
        self.attr = attr  # category label, e.g. '首图' / '颜色' / '视频'
        self.pos = pos    # destination path on disk
class PageUrl():
    """A product-page URL paired with the Referer to send when fetching it
    (Taobao's anti-scraping checks require a plausible Referer)."""

    def __init__(self, url, ref):
        # Keep both pieces together so gethtml() can be called as
        # gethtml(p.url, ref=p.ref).
        self.url, self.ref = url, ref
def threadfunc():
    """Worker loop: take Picture tasks off the global queue ``q`` and save
    each resource to its target path, exiting when the queue is empty."""
    global q
    while True:
        # BUG FIX: get_nowait() + Empty replaces the empty()/get() pair,
        # which raced between threads (another worker could drain the
        # queue between the two calls).
        try:
            task = q.get_nowait()
        except Empty:
            break
        try:
            r = gethtml(task.url)
            if r is None:
                # BUG FIX: a failed download used to create an empty file
                # and raise AttributeError on r.content.
                print('一个{}下载失败'.format(task.attr))
                continue
            with open(task.pos, 'wb') as f:
                f.write(r.content)
            print('一个{}下载完毕'.format(task.attr))
        except Exception as e:
            print(e)
def getallgoods(url, refer):
    """Collect an item-page PageUrl for every product in a shop listing.

    url   -- shop list URL; a ``pageNo`` query parameter is optional and
             is rewritten per page when present.
    refer -- Referer to send; also becomes the new global default.
    Returns a list of PageUrl objects, one per item id found.
    """
    global dreferer
    dreferer = refer
    aurls = []
    page = 1
    mpage = 1  # updated from the "current/total" counter on each page
    # Hoist loop-invariant regex compilation out of the loop.
    find_page = re.compile(r'(\d)/(\d)')
    find_id = re.compile(r'sellerId=(\d+)&itemIds=([^&]+)&')
    find_pageno = re.compile(r'pageNo=\d+')
    while page <= mpage:
        if 'pageNo' in url:
            # BUG FIX: rewrite the whole pageNo value; url[:-2] assumed a
            # single-digit page number at the very end of the URL.
            res = gethtml(find_pageno.sub('pageNo={}'.format(page), url))
        else:
            res = gethtml('{}&pageNo={}'.format(url, page))
        if res is None:
            # Download failed (gethtml returns None) -> stop paging
            # instead of crashing on res.text.
            break
        ref = refer + '&pageNo={}'.format(page)
        # BUG FIX: int() instead of eval() — the match is a digit, so
        # there is no reason to evaluate it as code.
        mpage = int(find_page.search(res.text).group(2))
        itemIds = find_id.search(res.text).group(2).split(',')
        for i in itemIds:
            u = 'https://item.taobao.com/item.htm?id={}'.format(i)
            aurls.append(PageUrl(u, ref))
        page += 1
    return aurls