好长时间没有更新,今天更新一次!
因为工作原因,一直在使用pyspider框架,有半年没有用过requests了,知识点也忘记了很多。今天写了一个多进程爬取APP图片数据的脚本,供大家参考!
import re, random, time, json, requests, datetime, os
from pyquery import PyQuery as pq
from multiprocessing import Pool
#detail_page函数是获取详情页的内容,当然可以用不同的数据解析包来提取数据,这个地方可以视情况而定
def detail_page(page_url):
    """Fetch one article detail page and push each of its images upstream.

    Every image URL in the article's ``image_list`` is wrapped in a
    resource dict and POSTed to the ingestion endpoint. Payloads that
    fail to send are appended (one JSON document per line) to a
    ``wanka_error`` file in the current directory so they can be
    replayed later instead of being lost.

    :param page_url: full detail-API URL (``.../getbyid?id=...``).
    """
    res = requests.get(page_url, timeout=10)
    res_dict = res.json()
    # 'image_list' arrives as a JSON-encoded string inside the JSON
    # response, hence the extra json.loads().
    for image_url in json.loads(res_dict['info']['image_list']):
        data = {
            # Pseudo-unique id: unix-seconds timestamp + 5 random digits.
            "pid": str(time.time()).split('.')[0] + str(random.randint(10000, 99999)),
            "task_id": 257609,
            "clue_id": 437389,
            "clue_name": '玩咖',
            "company_id": 230433,
            "url": page_url,
            "pic_url": image_url,
            # NOTE(review): get_date2 is not defined in this file — it must
            # be provided elsewhere in the project; confirm it is in scope.
            "client_date": get_date2(res_dict['info']['mtime']),
            "url_article_title": res_dict['info']['title'],
            # Plain text of all <p> elements of the article body.
            "url_article": pq(res_dict['info']['content'])('p').text(),
            "is_cover": 0,
        }
        payload = json.dumps({'resource': data})
        try:
            url = 'http://shijue.qingapi.cn/task_python/start'
            requests.post(url, data={"data": payload}, timeout=10)
        except requests.RequestException:
            # Best effort: keep the failed payload for later replay rather
            # than aborting the crawl. (Was a bare ``except:``, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            filename = os.path.join(os.path.abspath('.'), 'wanka_error')
            with open(filename, 'a', encoding='utf-8') as f:
                f.write(payload)
                f.write('\n')
        print(data)
#list_page函数是APP每个板块的内容,进行分页,再将获取到的详情页的url传给detail_page函数
# Query strings for the two captured device identities. They differ in
# cuid / ovr / device / channel_id / client_id.
_PARAMS_HUAWEI = ('cuid=FA2B688F603E1C48EE93CB8291D5A0D5&svr=2.0.0.5&vcode=20'
                  '&ovr=8.0.0&device=HUAWEI_FRD-AL00&app_id=h9999j'
                  '&channel_id=90001b&client_id=UKiUuZkYs%2BYRgWJPphEM7w%3D%3D')
_PARAMS_MEIZU = ('cuid=E8F10259E26C5509EBCFDC46459812DB&svr=2.0.0.5&vcode=20'
                 '&ovr=7.1.2&device=Meizu_M6+Note&app_id=h9999j'
                 '&channel_id=90001a&client_id=%2BbVBZZb5aMV2hD%2FXTvehaQ%3D%3D')


def _crawl_channel(pid, channel, detail_api, id_key, params, verbose=False):
    """Walk 100 list pages of one app channel, feeding every item's detail
    URL to detail_page().

    :param pid: first page number; pages [pid, pid+100) are crawled.
    :param channel: list-API channel name, e.g. ``'recommendation'``.
    :param detail_api: detail-API path segment (``'article'`` / ``'gallery'``).
    :param id_key: key of the item id in each list entry.
    :param params: device/auth query string appended to every request.
    :param verbose: print each page number as it is fetched.
    """
    for page in range(pid, pid + 100):
        if verbose:
            print(page)
        list_url = ('http://data.gm825.com/api/channel/%s?pn=%d&%s'
                    % (channel, page, params))
        listing = requests.get(list_url, timeout=10).json()
        for item in listing['list']:
            detail_url = ('http://data.gm825.com/api/%s/getbyid?id=%s&%s'
                          % (detail_api, str(item[id_key]), params))
            try:
                detail_page(detail_url)
            except Exception:
                # Skip a broken article without stopping the whole channel.
                # (Was a bare ``except:``; Exception keeps the best-effort
                # semantics without masking KeyboardInterrupt/SystemExit.)
                pass


# list_page1 crawls every board of the app: for each board it pages through
# the listing API and hands each detail-page URL to detail_page().
def list_page1(pid):
    """Crawl pages [pid, pid+100) of all four app boards.

    The four copy-pasted loops of the original are expressed as one
    parameterised helper; behaviour (endpoints, id keys, query strings,
    page-number printing for the first board) is unchanged.
    """
    _crawl_channel(pid, 'recommendation', 'article', 'module_id',
                   _PARAMS_HUAWEI, verbose=True)
    _crawl_channel(pid, 'mha', 'article', 'id', _PARAMS_MEIZU)
    _crawl_channel(pid, 'gallery', 'gallery', 'id', _PARAMS_MEIZU)
    _crawl_channel(pid, 'mixture', 'article', 'id', _PARAMS_MEIZU)
if __name__ == '__main__':
    # Pool of 10 worker processes; each task crawls a disjoint range of
    # 100 pages starting at start_page (6000 pages total, 1000..6900).
    pool = Pool(10)
    for start_page in range(10, 70):
        pool.apply_async(list_page1, args=(start_page * 100,))
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait for all workers to finish
代码就这些,如果有不懂的朋友可以加我Q353061949,我会给你讲解哦!