python apply_async not executing the function: the process pool finishes before the functions added via apply_async ever run...

This post presents a Python script that uses the BeautifulSoup and requests libraries to scrape book tags from Douban and a multiprocessing Pool to process them in parallel. The script first collects every tag and its URL, then crawls the book detail pages under each tag and stores the book IDs in MongoDB. Whenever a request comes back with a 403, it swaps in a new proxy IP and retries.
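Before the script itself, a note on the symptom in the title: the pool shutting down without the submitted function ever appearing to run almost always means apply_async swallowed an exception. apply_async returns an AsyncResult and only re-raises errors when .get() is called on it (or reports them through an error_callback); if neither is used, a task that fails before or while running, for example because the callable or its arguments could not be pickled, disappears silently and close()/join() return as if nothing had been submitted. Bound methods cannot be pickled at all on Python 2, and on Python 3 they drag the whole instance along, which may fail if the instance holds connection objects. A minimal, generic sketch of how to make such failures visible (the work function and its argument are illustrative, not part of the script below):

from multiprocessing import Pool

def work(n):                          # illustrative worker, not from the script below
    if n == 3:
        raise ValueError('boom')      # simulate a failure inside one task
    return n * n

if __name__ == '__main__':
    p = Pool(4)
    results = []
    for n in range(5):
        # error_callback runs in the parent process as soon as a task fails
        r = p.apply_async(work, args=(n,),
                          error_callback=lambda e: print('worker failed:', e))
        results.append(r)
    p.close()
    p.join()
    for r in results:
        try:
            print(r.get())            # .get() re-raises whatever the task hit
        except ValueError as e:
            print('task raised:', e)

Keeping every AsyncResult and calling .get() on it (or at least attaching an error_callback) is the quickest way to see whether the tasks in the script below fail to pickle or raise inside the worker.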

from bs4 import BeautifulSoup
import random
import requests
import pymongo
import datetime
import time
from multiprocessing import Pool

user_agents = [
    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 '
    'Mobile/13B143 Safari/601.1',
    'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.23 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.23 Mobile Safari/537.36']

heads = {
    'User-Agent': random.choice(user_agents)
}

ipHeads = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.xicidaili.com/nn/',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

class douban():
    def __init__(self):
        self.client = pymongo.MongoClient('localhost', 27017)
        self.db = self.client['books']        # one collection per tag will hold the book ids
        self.tool = self.client['tool']
        self.collectIp = self.tool['ip']      # proxy pool maintained elsewhere

    def getFromSQL(self):
        # Pick one proxy record from the pool and turn it into a requests proxies dict.
        item = self.collectIp.find_one({'http': 'http'})
        proxies = {}
        proxies[item['http']] = 'http://' + item['ip'] + ':' + item['port']
        return proxies

    def getAllTag(self):
        # Fetch the tag index page and collect every tag group title plus the tag links under it.
        url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
        proxies = self.getFromSQL()
        s = requests.get(url, headers=heads, proxies=proxies)
        if s.status_code == 403:
            # The proxy got banned: drop it from the pool and retry with a fresh one.
            values = list(proxies.values())[0]
            ip = values.split('//')[1].split(':')[0]
            self.collectIp.remove({'ip': ip})
            proxies = self.getFromSQL()
            s = requests.get(url, headers=heads, proxies=proxies)
        soup = BeautifulSoup(s.text, 'lxml')
        titleTags = soup.find_all('a', class_='tag-title-wrapper')
        tagList = soup.find_all('table', class_='tagCol')
        href = {}
        titleList = []
        i = 0
        for titleTag in titleTags:
            title = titleTag['name']
            titleList.append(title)
            trs = tagList[i].find_all('tr')
            hreflist = []
            for tr in trs:
                hreflist.append(tr.td.a['href'])
            href[title] = hreflist
            i = i + 1
        return titleList, href

    def getAllBookUrl(self, title, hrefDic):
        print('a')                            # debug print: shows the worker actually started
        collect = self.db[title]
        for href in hrefDic[title]:
            index = 0
            while 1:
                # Page through the tag listing 20 books at a time.
                url = 'https://book.douban.com' + href + '?start=' + str(index) + '&type=T'
                proxies = self.getFromSQL()
                s = requests.get(url, headers=heads, proxies=proxies)
                if s.status_code == 403:
                    # Banned proxy: remove it from the proxy pool and retry with a new one.
                    values = list(proxies.values())[0]
                    ip = values.split('//')[1].split(':')[0]
                    self.collectIp.remove({'ip': ip})
                    proxies = self.getFromSQL()
                    s = requests.get(url, headers=heads, proxies=proxies)
                html = s.text
                soup = BeautifulSoup(html, 'lxml')
                liList = soup.find_all('li', class_='subject-item')
                if len(liList):
                    for li in liList:
                        # Slice the numeric book id out of the subject URL.
                        id = li.find('a')['href'][32:-1]
                        collect.insert({'bookId': id})
                    index += 20
                    time.sleep(3)
                else:
                    break

if __name__ == '__main__':
    p = Pool(4)
    a = douban()
    titleList, hrefDic = a.getAllTag()
    for i in range(len(titleList)):
        print('Start crawling %s' % titleList[i])
        # the bound method a.getAllBookUrl is what gets submitted here; see the notes after the script
        p.apply_async(a.getAllBookUrl, args=(titleList[i], hrefDic))
    p.close()
    p.join()
    # a = douban()
    # titleList, hrefDic = a.getAllTag()
    # a.getAllBookUrl(titleList[0], hrefDic)
    # print('done')
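If .get() on the results of the script above reports a pickling error, one common workaround is to submit a plain module-level function and let each task build its own douban instance, so the MongoClient (which pymongo's FAQ advises creating after forking, and which may not pickle cleanly) never has to cross the process boundary. A rough sketch under that assumption; crawl_tag is a hypothetical helper, not part of the original script:

def crawl_tag(title, hrefDic):
    # hypothetical module-level worker: picklable by construction;
    # each task creates its own douban(), so the MongoClient is built inside the worker process
    worker = douban()
    worker.getAllBookUrl(title, hrefDic)
    return title

if __name__ == '__main__':
    a = douban()
    titleList, hrefDic = a.getAllTag()
    p = Pool(4)
    results = [p.apply_async(crawl_tag, args=(t, hrefDic)) for t in titleList]
    p.close()
    p.join()
    for r in results:
        print('finished', r.get())    # raises here if any tag failed, instead of failing silently

Only strings and a plain dict travel to the workers here, so pickle has nothing awkward to serialize, and every failure surfaces through r.get() instead of vanishing.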
