Python 多进程爬取安居客

# -*- coding: utf-8 -*-
"""
Created on Sat Jan 20 18:08:21 2018

@author: Administrator
"""

import requests
from bs4 import BeautifulSoup
import time
from multiprocessing import Pool
# HTTP request headers sent with every page fetch; only a desktop Chrome
# User-Agent string is set.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.94 Safari/537.36'
    ),
}
def get_loupan(url):
    """Fetch one listing page and return every listing found on it.

    The page is downloaded with the module-level ``headers`` and parsed with
    BeautifulSoup. Four element groups are read and paired up positionally:
    ``span.items-name``, ``span.list-map``, ``a.huxing`` and ``a.favor-pos``.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts, one per listing, with the keys '标题' (title),
        '地址' (address), '地区' (district), '面积(全)' (full layout/area
        text), '面积' (area) and '价格' (price). The list is empty when the
        request fails or the page does not have the expected structure.
    """
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        res = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        names = soup.find_all('span', class_='items-name')
        locations = soup.find_all('span', class_='list-map')
        layouts = soup.find_all('a', class_='huxing')
        prices = soup.find_all('a', class_='favor-pos')
        # zip() stops at the shortest group, so a listing missing any one of
        # the four elements truncates the result instead of misaligning it.
        return [
            {
                '标题': name.text,
                '地址': location.text,
                # The district is the second '\xa0'-separated field.
                '地区': location.text.split('\xa0')[1],
                '面积(全)': layout.text,
                # The area is the last tab-separated field of the layout text.
                '面积': layout.text.split('\t')[-1].strip(),
                '价格': price.p.text,
            }
            for name, location, layout, price
            in zip(names, locations, layouts, prices)
        ]
    except (requests.RequestException, AttributeError, IndexError):
        # Best-effort scraping: network errors and unexpected markup (missing
        # <p> inside the price link, no '\xa0' in the location text) yield an
        # empty result rather than aborting the whole crawl.
        return []
if __name__ == '__main__':
    # Benchmark: crawl the same ten listing pages once serially and then with
    # process pools of different sizes, printing the elapsed seconds for each.
    urls = ['https://sh.fang.anjuke.com/loupan/all/p{}/'.format(i)
            for i in range(1, 11)]

    # Serial baseline: one page at a time with a 1 s pause between requests.
    start = time.time()
    for url in urls:
        get_loupan(url)
        time.sleep(1)
    print('串行爬虫:', time.time() - start)

    # Pool.map() needs an iterable of URLs. Handing it a single URL string
    # makes it iterate over the characters of that string, so every worker
    # call receives one character instead of a page address.
    # The last two runs are the same measurement (4 workers); both labels are
    # kept so the script's output lines stay unchanged.
    for label, workers in (('两个进程:', 2),
                           ('四个进程:', 4),
                           ('(四)个进程:', 4)):
        start = time.time()
        # The context manager shuts the pool down so worker processes do not
        # pile up between runs.
        with Pool(processes=workers) as pool:
            pool.map(get_loupan, urls)
        # Constant 1 s pause, as in the original pooled run.
        time.sleep(1)
        print(label, time.time() - start)

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值