import os
import urllib.parse
import requests
from pyquery import PyQuery as pq
import json
import re
import pymongo
import time
from multiprocessing import Pool
import threading
# Search keyword passed to get_tuji_url() by main(); hard-coded here.
# Alternative (disabled): keywords = input('Enter the keyword to search for: ')
keywords="美女"
def get_tuji_url(offset,keywords):
    """Return the gallery article URLs listed on one search-result page.

    Calls the Toutiao ``search_content`` JSON endpoint with the given offset
    and keyword and converts every entry's ``article_url`` into a
    ``https://www.toutiao.com/a<id>`` link, where ``<id>`` is the
    second-to-last ``/``-separated segment of ``article_url``.

    Args:
        offset: Integer result offset (formatted into the URL with ``%d``).
        keywords: Search keyword; it is URL-quoted before being sent.

    Returns:
        A list of article URLs. The list is empty when the request fails,
        the server does not answer with status 200, or the payload has no
        usable entries, so callers can always iterate the result.
    """
    urllist = []
    keyword_encode = urllib.parse.quote(keywords)
    # The original query string contained "autoload/=true"; the stray slash
    # made the parameter name invalid, so it is removed here.
    url = ("https://www.toutiao.com/search_content/?offset=%d&format=json&keyword=%s&autoload"
           "=true&count=20&cur_tab=3&from=gallery" % (offset, keyword_encode))
    try:
        # A timeout keeps a stalled connection from blocking the worker forever.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return urllist
        payload = json.loads(response.content.decode("utf-8"))
    except (requests.RequestException, ValueError) as e:
        # ValueError covers both UnicodeDecodeError and json.JSONDecodeError.
        print('1:' + str(e))
        return urllist

    entries = payload.get("data") if isinstance(payload, dict) else None
    for each in entries or []:
        # Some entries may carry no article link; skip them instead of failing.
        article_url = each.get('article_url') if isinstance(each, dict) else None
        if not article_url:
            continue
        parts = article_url.split('/')
        if len(parts) < 2:
            continue
        urllist.append(r"https://www.toutiao.com/" + 'a' + parts[-2])
    return urllist
def get_pic_url(url):
    """Fetch one gallery page and extract its title and picture URLs.

    The page source is searched for the text between ``gallery: JSON.parse``
    and the following ``sib``; inside that span the ``{...}`` payload is
    taken, backslashes are stripped, and every ``http://..."`` occurrence is
    rewritten to ``http://p3.pstatp.com/origin/pgc-image/<last path segment>``.

    Args:
        url: Address of the gallery article page.

    Returns:
        ``(title, pic_urllist)`` where ``title`` is the text of the page's
        ``<title>`` element and ``pic_urllist`` is the de-duplicated list of
        picture URLs in order of first appearance. Returns ``None`` when the
        request fails, the status is not 200, or the page does not contain
        the expected gallery data.
    """
    try:
        # A timeout keeps a stalled connection from blocking the worker forever.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        content = response.content.decode("utf-8")
        title = pq(content)('title').text()
    except (requests.RequestException, ValueError, TypeError) as e:
        # ValueError also covers UnicodeDecodeError from the decode above.
        print('2:' + str(e))
        return None

    # NOTE(review): both patterns use nested quantifiers, which can backtrack
    # heavily on pages that contain the prefix but never match; they are kept
    # unchanged because the exact page layout cannot be verified from here.
    gallery = re.search(r'gallery: JSON.parse(.*\s*)*?sib', content)
    if gallery is None:
        return None
    payload = re.search(r'{(.*\s*)*}', gallery.group())
    if payload is None:
        return None
    cleaned = payload.group().replace('\\', '')

    pic_urllist = []
    seen = set()
    for each in re.findall(r'http://.*?"', cleaned):
        # Each match ends with the closing quote, hence the [:-1].
        each_url = "".join([r'http://p3.pstatp.com/origin/pgc-image/', each.split('/')[-1][:-1]])
        if each_url not in seen:
            seen.add(each_url)
            pic_urllist.append(each_url)
    return title, pic_urllist
def save_pic(content):
    """Download all pictures of one gallery into a folder named after it.

    Files are written as ``<folder>/<index>.jpg`` where ``index`` is the
    position of the URL in the list. The folder is addressed by path instead
    of changing the working directory, because the working directory is
    shared by every thread of the process.

    Args:
        content: ``(folder, picture_urls)`` pair; a falsy value such as
            ``None`` is ignored.

    Returns:
        None.
    """
    if not content:
        return
    try:
        path, pic_urls = content[0], content[1]
        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs(path, exist_ok=True)
    except (OSError, TypeError) as e:
        print(e)
        return

    for num, each in enumerate(pic_urls):
        try:
            data = requests.get(each, timeout=10).content
            with open(os.path.join(path, '%d.jpg' % num), 'wb') as f:
                f.write(data)
        except OSError as e:
            # requests' exceptions derive from IOError/OSError, so this also
            # covers failed downloads; one bad picture must not abort the rest.
            print(e)
def main(offset):
    """Download every gallery found on one search-result page.

    Args:
        offset: Result offset of the search page to process; forwarded to
            ``get_tuji_url`` together with the module-level ``keywords``.
    """
    # get_tuji_url may yield nothing (or None) when the request fails.
    for each in get_tuji_url(offset, keywords) or []:
        content = get_pic_url(each)
        if content is None:
            # Page could not be fetched or parsed; nothing to save.
            continue
        # save_url(content)  # disabled in the original; save_url is not visible here
        save_pic(content)
if __name__=="__main__":
    # Script entry point: process the result pages at offsets 0 and 20,
    # one worker thread per page, and report the total elapsed time.
    timestart = time.time()
    threads = []
    # NOTE: threads share the process working directory, so anything that
    # relies on os.chdir() is not safe to run from these workers.
    for offset in range(0, 20 + 1, 20):
        t = threading.Thread(target=main, args=(offset,))
        t.start()
        threads.append(t)
    # Wait for every worker so the measured time covers the actual downloads.
    for t in threads:
        t.join()
    timeend = time.time()
    print('总耗时为:%0.3f'%(timeend-timestart))