A simple Anjuke housing-information scraper (2). It crawls the listing pages with a pool of worker processes (multiprocessing.Pool) and stores the scraped records in a MongoDB database.
from lxml import etree
import requests
import time
import pymongo
from multiprocessing import Pool

# Connect to the local MongoDB instance; scraped listings go into the anjuke2 collection of the mydb database
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
anjuke2 = mydb['anjuke2']

# Request headers: present a regular desktop-browser User-Agent so the site serves normal pages
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
def get_info(url):
    # Fetch one listing page and select each house entry with XPath
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    infos = selector.xpath('//*[@id="houselist-mod-new"]/li')
    for info in infos:
        title = info.xpath('div[2]/div[1]/a/text()')[0].strip()
        yangshi = info.xpath('div[2]/div[2]/span[1]/text()')[0]     # layout
        mianji = info.xpath('div[2]/div[2]/span[2]/text()')[0]      # floor area
        niandai = info.xpath('div[2]/div[2]/span[4]/text()')        # build year, sometimes missing
        if len(niandai) == 0:
            niandai = '无'
        else:
            niandai = niandai[0].strip()
        dizhi = info.xpath('div[2]/div[3]/span/text()')[0].strip()  # address
        danjia = info.xpath('div[3]/span[2]/text()')[0]             # price per square metre
        zongjia1 = info.xpath('div[3]/span[1]/strong/text()')[0]    # total price: number
        zongjia2 = info.xpath('div[3]/span[1]/text()')[0]           # total price: unit (万)
        zongjia = zongjia1 + zongjia2
        all_info_list = {
            '标题': title,
            '样式': yangshi,
            '面积': mianji,
            '年代': niandai,
            '地址': dizhi,
            '单价(元/平方)': danjia,
            '总价(万元)': zongjia
        }
        # One MongoDB document per listing
        anjuke2.insert_one(all_info_list)
    time.sleep(1)  # pause briefly after each page to be gentle on the site
if __name__ == '__main__':
    # Listing pages 1-29 of the wuzhong-q-szyuexi area on suzhou.anjuke.com
    urls = ['https://suzhou.anjuke.com/sale/wuzhong-q-szyuexi/p{}'.format(str(i)) for i in range(1, 30)]
    pool = Pool(processes=6)   # pool of 6 worker processes
    start = time.time()
    pool.map(get_info, urls)   # scrape the pages in parallel
    end = time.time()
    print('爬虫时间:', end - start)   # elapsed crawl time in seconds
The figure below shows the scraped data (a screenshot of the records stored in MongoDB).
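If the screenshot is not at hand, the stored records can also be inspected directly from Python. The snippet below is a minimal sketch, assuming MongoDB is still running on localhost:27017 and the data sits in the anjuke2 collection of mydb, exactly as in the scraper above; it prints the document count and the first few listings.

import pymongo

# Same connection settings as the scraper above (assumes a local MongoDB on the default port)
client = pymongo.MongoClient('localhost', 27017)
anjuke2 = client['mydb']['anjuke2']

# Total number of listings that were stored
print('documents stored:', anjuke2.count_documents({}))

# Show the first five listings, hiding MongoDB's internal _id field
for doc in anjuke2.find({}, {'_id': 0}).limit(5):
    print(doc)

Because the scraper calls insert_one once per listing, every house ends up as its own document, so these queries work without any post-processing.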