python apply_async not executing the function: the process pool finishes before the functions added via apply_async ever run...

This post presents a Python script that uses the BeautifulSoup and requests libraries to scrape book tags from Douban and a multiprocessing Pool to process them in parallel. The script first collects every tag and its URL, then crawls the book detail pages under each tag and stores the book IDs in MongoDB. Whenever a request comes back with a 403, it swaps in a new proxy IP and retries.
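Before the script itself, a note on the symptom in the title: the pool shutting down without the submitted function ever appearing to run almost always means apply_async swallowed an exception. apply_async returns an AsyncResult and only re-raises errors when .get() is called on it (or reports them through an error_callback); if neither is used, a task that fails before or while running, for example because the callable or its arguments could not be pickled, disappears silently and close()/join() return as if nothing had been submitted. Bound methods cannot be pickled at all on Python 2, and on Python 3 they drag the whole instance along, which may fail if the instance holds connection objects. A minimal, generic sketch of how to make such failures visible (the work function and its argument are illustrative, not part of the script below):

from multiprocessing import Pool

def work(n):                          # illustrative worker, not from the script below
    if n == 3:
        raise ValueError('boom')      # simulate a failure inside one task
    return n * n

if __name__ == '__main__':
    p = Pool(4)
    results = []
    for n in range(5):
        # error_callback runs in the parent process as soon as a task fails
        r = p.apply_async(work, args=(n,),
                          error_callback=lambda e: print('worker failed:', e))
        results.append(r)
    p.close()
    p.join()
    for r in results:
        try:
            print(r.get())            # .get() re-raises whatever the task hit
        except ValueError as e:
            print('task raised:', e)

Keeping every AsyncResult and calling .get() on it (or at least attaching an error_callback) is the quickest way to see whether the tasks in the script below fail to pickle or raise inside the worker.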

from bs4 import BeautifulSoup
import random
import requests
import pymongo
import datetime
import time
from multiprocessing import Pool

user_agents = [
    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 '
    'Mobile/13B143 Safari/601.1',
    'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.23 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.23 Mobile Safari/537.36']

heads = {
    'User-Agent': random.choice(user_agents)
}

ipHeads = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.xicidaili.com/nn/',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

class douban():
    def __init__(self):
        self.client = pymongo.MongoClient('localhost', 27017)
        self.db = self.client['books']        # one collection per tag will hold the book ids
        self.tool = self.client['tool']
        self.collectIp = self.tool['ip']      # proxy pool maintained elsewhere

    def getFromSQL(self):
        # Pick one proxy record from the pool and turn it into a requests proxies dict.
        item = self.collectIp.find_one({'http': 'http'})
        proxies = {}
        proxies[item['http']] = 'http://' + item['ip'] + ':' + item['port']
        return proxies

    def getAllTag(self):
        # Fetch the tag index page and collect every tag group title plus the tag links under it.
        url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        proxies = self.getFromSQL()
        s = requests.get(url, headers=heads, proxies=proxies)
        if s.status_code == 403:
            # The proxy got banned: drop it from the pool and retry with a fresh one.
            values = list(proxies.values())[0]
            ip = values.split('//')[1].split(':')[0]
            self.collectIp.remove({'ip': ip})
            proxies = self.getFromSQL()
            s = requests.get(url, headers=heads, proxies=proxies)
        soup = BeautifulSoup(s.text, 'lxml')
        titleTags = soup.find_all('a', class_='tag-title-wrapper')
        tagList = soup.find_all('table', class_='tagCol')
        href = {}
        titleList = []
        i = 0
        for titleTag in titleTags:
            title = titleTag['name']
            titleList.append(title)
            trs = tagList[i].find_all('tr')
            hreflist = []
            for tr in trs:
                hreflist.append(tr.td.a['href'])
            href[title] = hreflist
            i = i + 1
        return titleList, href

    def getAllBookUrl(self, title, hrefDic):
        print('a')                            # debug print: shows the worker actually started
        collect = self.db[title]
        for href in hrefDic[title]:
            index = 0
            while 1:
                # Page through the tag listing 20 books at a time.
                url = 'https://book.douban.com' + href + '?start=' + str(index) + '&type=T'
                proxies = self.getFromSQL()
                s = requests.get(url, headers=heads, proxies=proxies)
                if s.status_code == 403:
                    # Banned proxy: remove it from the proxy pool and retry with a new one.
                    values = list(proxies.values())[0]
                    ip = values.split('//')[1].split(':')[0]
                    self.collectIp.remove({'ip': ip})
                    proxies = self.getFromSQL()
                    s = requests.get(url, headers=heads, proxies=proxies)
                html = s.text
                soup = BeautifulSoup(html, 'lxml')
                liList = soup.find_all('li', class_='subject-item')
                if len(liList):
                    for li in liList:
                        # Slice the numeric book id out of the subject URL.
                        id = li.find('a')['href'][32:-1]
                        collect.insert({'bookId': id})
                    index += 20
                    time.sleep(3)
                else:
                    break

if __name__ == '__main__':
    p = Pool(4)
    a = douban()
    titleList, hrefDic = a.getAllTag()
    for i in range(len(titleList)):
        print('Start crawling %s' % titleList[i])
        # the bound method a.getAllBookUrl is what gets submitted here; see the notes after the script
        p.apply_async(a.getAllBookUrl, args=(titleList[i], hrefDic))
    p.close()
    p.join()
    # a = douban()
    # titleList, hrefDic = a.getAllTag()
    # a.getAllBookUrl(titleList[0], hrefDic)
    # print('done')
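If .get() on the results of the script above reports a pickling error, one common workaround is to submit a plain module-level function and let each task build its own douban instance, so the MongoClient (which pymongo's FAQ advises creating after forking, and which may not pickle cleanly) never has to cross the process boundary. A rough sketch under that assumption; crawl_tag is a hypothetical helper, not part of the original script:

def crawl_tag(title, hrefDic):
    # hypothetical module-level worker: picklable by construction;
    # each task creates its own douban(), so the MongoClient is built inside the worker process
    worker = douban()
    worker.getAllBookUrl(title, hrefDic)
    return title

if __name__ == '__main__':
    a = douban()
    titleList, hrefDic = a.getAllTag()
    p = Pool(4)
    results = [p.apply_async(crawl_tag, args=(t, hrefDic)) for t in titleList]
    p.close()
    p.join()
    for r in results:
        print('finished', r.get())    # raises here if any tag failed, instead of failing silently

Only strings and a plain dict travel to the workers here, so pickle has nothing awkward to serialize, and every failure surfaces through r.get() instead of vanishing.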
