Python 爬虫之进程池爬虫(世纪佳缘案例)

from multiprocessing import Process,Queue

import requests

import re

from lxml.html import etree

import json

import time

from concurrent.futures import ProcessPoolExecutor

def down_load_page_data(req_url):
    """Fetch one listing page and compute the URL of the next page to crawl.

    Parses ``page``, ``city_id`` and ``shop_id`` out of *req_url*, downloads
    the page via the sibling helper ``download_data`` (defined elsewhere in
    the original script), saves the first (HTML) page to disk, and — while the
    JSON payload of later pages is still a list — builds the next page URL by
    bumping the ``page=`` query parameter.

    NOTE(review): SOURCE is a whitespace-mangled blog extract that is cut off
    mid-function after ``print('已获取到' + str(current_page)``; the tail
    (closing of that print and the return value) is reconstructed
    conservatively — confirm against the original article.

    :param req_url: request URL containing page, city_id and shop_id params
    :return: tuple ``(data, next_page)`` where ``data`` is a dict with the
             page number and raw response text (or None on non-200), and
             ``next_page`` is the follow-up URL or None when pagination ends
    """
    # Extract (page, city_id, shop_id) from the request URL.
    pattern = re.compile(r'.*?page=(\d+).*?city_id=(\d+).*?shop_id=(\d+)')
    result = re.findall(pattern, req_url)[0]

    DATE_SHOW_LOC = result[1]   # city_id
    DATE_SHOW_SHOP = result[2]  # shop_id

    response = download_data(req_url, DATE_SHOW_LOC, DATE_SHOW_SHOP)

    data = None
    next_page = None
    if response.status_code == 200:
        current_page = int(result[0])
        data = {'page': current_page, 'data': response.text}

        if current_page == 1:
            # First page is a full HTML document: persist it for later
            # parsing, then always try page 2.
            with open(str(result[1]) + '.html', 'w') as file:
                file.write(response.text)
            next_page = re.sub(r'page=\d+', 'page=' + str(current_page + 1), response.url)
            print('正在获取第' + str(current_page + 1) + '页', DATE_SHOW_LOC, DATE_SHOW_SHOP)
        else:
            # Later pages return JSON; a list payload means more data exists,
            # anything else signals the end of pagination.
            if isinstance(json.loads(response.text), list):
                next_page = re.sub(r'page=\d+', 'page=' + str(current_page + 1), response.url)
                print('正在获取第' + str(current_page + 1) + '页', DATE_SHOW_LOC, DATE_SHOW_SHOP)
            else:
                next_page = None
                print(response.text)
                # Original line truncated in SOURCE; completed minimally.
                print('已获取到' + str(current_page))
    return data, next_page

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值