# -*- coding: utf-8 -*-
"""
Created on Sat Jan 20 22:16:40 2018
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
import time
import re
from multiprocessing import Pool
# Browser-like request headers so Anjuke serves the regular desktop page
# instead of blocking the default python-requests user agent.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/62.0.3202.94 Safari/537.36'),
}

# Accumulator for every scraped listing dict (filled by get_loupan).
total = []
def get_loupan(url):
    """Scrape one Anjuke listing page and append each property's info to `total`.

    Parameters
    ----------
    url : str
        URL of one paginated Anjuke new-home listing page.

    Returns
    -------
    list
        The module-level `total` list of info dicts.

    NOTE(review): when called through multiprocessing.Pool, each worker
    process has its own copy of `total`, so results appended there never
    reach the parent — collect them from pool.map's return value instead.
    """
    # timeout prevents a dead server from hanging the whole crawl forever
    res = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(res.text, 'html.parser')
    fang_all = soup.find_all('div', rel='nofollow')  # one tag per property listing
    for fang_each in fang_all:
        title = fang_each.find('span', class_='items-name').text.strip()
        jiage = fang_each.find('p').text.strip()  # price
        dizhi = fang_each.find('span', class_='list-map').text.strip()  # address
        mianji = fang_each.find('a', class_='huxing')  # floor area / layout
        if mianji is None:
            # area not published yet — can't call .text on None
            mianji_1 = '尚未公开'
        else:
            # collapse all internal whitespace in the area/layout text
            mianji_1 = ''.join(re.split(r'\s+', mianji.text))
        pinglun = fang_each.find('span', class_='list-dp')  # review/comment tag
        pinglun_1 = '' if pinglun is None else pinglun.text
        info = {'标题': title,
                '价格': jiage,
                '地址': dizhi,
                '面积': mianji_1,
                '评论': pinglun_1}
        total.append(info)
    return total
if __name__ == '__main__':
    # First 24 pages of Shanghai new-home listings.
    urls = ['https://sh.fang.anjuke.com/loupan/all/p{}/'.format(i)
            for i in range(1, 25)]

    # Sequential baseline.
    start_1 = time.time()
    for url in urls:
        get_loupan(url)
    end_1 = time.time()
    print('串行爬虫:', end_1 - start_1)

    # 2-process pool. The context manager close()s and join()s the workers;
    # the original leaked both pools by never shutting them down.
    start_2 = time.time()
    with Pool(processes=2) as pool:
        pool.map(get_loupan, urls)
    end_2 = time.time()
    print('2进程爬虫:', end_2 - start_2)

    # 4-process pool.
    start_3 = time.time()
    with Pool(processes=4) as pool:
        pool.map(get_loupan, urls)
    end_3 = time.time()
    print('4进程爬虫:', end_3 - start_3)
'''
import pandas as pd
df=pd.DataFrame(total)
df.to_excel('anjuke.xls')
'''
'''
import re
mianji_r=re.findall('<span>建筑面积:(.*?)</span>',res.text)
print(mianji_r)
'''
# Scraping Anjuke with Python (BeautifulSoup: find the outer container first, then the inner boxes)
# NOTE(review): blog-page residue from the original source (title + "last recommended 2024-04-08 13:45:22"),
# converted to comments so the file remains valid Python.