import random
import re
import requests
import time
from lxml import etree
from pymongo import MongoClient
from multiprocessing import Pool
from requests.exceptions import RequestException
def get_response(url):
    """Fetch *url* and return the response body as text, or None on failure.

    Bug fixes: the original referenced an undefined name ``headers`` (the
    user-agent and cookie strings were never assembled into a dict), and the
    cookie literal was terminated with a full-width quote ``‘`` — a syntax
    error.  Both are corrected here.

    :param url: absolute URL of the page to fetch
    :return: decoded response body, or None on any request error
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.'
    cookie = '5Od66EiO2sfsd'
    headers = {'User-Agent': user_agent, 'Cookie': cookie}
    try:
        response = requests.get(url, headers=headers)
        # Treat HTTP error statuses (4xx/5xx) as failures too; HTTPError is
        # a subclass of RequestException, so it is caught below.
        response.raise_for_status()
        return response.text
    except RequestException:
        return None
def get_one_page(response):
    """Cut the house-list section(s) out of a full page's HTML.

    :param response: raw HTML source of one listing page
    :return: list of HTML fragments, one per matched listing container
    """
    list_section = re.compile(
        r'class="houselist-mod houselist-mod-new"(.*?)<div id="IFX_p937".*?>',
        re.S,
    )
    return list_section.findall(response)
def get_content(item):
    """Parse one house-list fragment and yield one tuple per listing.

    Titles and addresses are taken via XPath; the remaining fields come
    from a single multi-group regex over the raw HTML.  Each yielded tuple
    is (title, layout, size, floor, year, price+'万', unit_price, address).
    The three sources are zipped positionally, so a listing missing any one
    of them silently truncates the output — inherited from the page layout.
    """
    selector = etree.HTML(item)  # parse the fragment so XPath applies
    titles = selector.xpath('//div[@class="house-details"]/div[@class="house-title"]/a/@title')
    detail_pattern = re.compile(
        r'<div class="details-item">.*?<span>(.*?)</span>.*?class="spe-lines".*?<span>(.*?)</span>'
        r'.*?class="spe-lines".*?<span>(.*?)</span>.*?class="spe-lines".*?<span>(.*?)</span>.*?'
        r'class="brokername">.*?<div class="pro-price">.*?class="price-det".*?<strong>(.*?)</strong>'
        r'.*?class="unit-price">(.*?)</span>',
        re.S,
    )
    details = detail_pattern.findall(item)
    addresses = selector.xpath('//div[@class="details-item"]/span[@class="comm-address"]/@title')
    for fields, name, addr in zip(details, titles, addresses):
        yield (name, fields[0], fields[1], fields[2], fields[3],
               fields[4] + '万', fields[5], addr)
def write_to_DB(title, hx, size, lc, build, price, u_price, address):
    """Save one listing as a document in the local MongoDB 'House' database.

    Bug fixes: ``Collection.save()`` was deprecated in PyMongo 3 and removed
    in PyMongo 4 — ``insert_one()`` is the supported call for new documents.
    The client is also closed in a ``finally`` so every call does not leak a
    connection pool.

    :param title: listing title (介绍)
    :param hx: layout (户型)
    :param size: area (大小)
    :param lc: floor (楼层)
    :param build: construction year (年代)
    :param price: total price (价钱)
    :param u_price: unit price (单价)
    :param address: address (地址)
    """
    client = MongoClient('localhost', 27017)  # connect to local MongoDB
    try:
        db = client['House']
        db.House.insert_one({"介绍": title, "户型": hx, "大小": size, "楼层": lc,
                             "年代": build, "价钱": price, "单价": u_price, "地址": address})
    finally:
        client.close()
def main(page):
    """Crawl one listing page and store every parsed house in MongoDB.

    Bug fixes: the original looped ``for p in [page]`` over a one-element
    list while rebinding ``url`` inside the loop (a latent bug had the list
    ever grown), and passed a possible ``None`` from ``get_response`` straight
    into the regex matcher, raising ``TypeError``.  Failed fetches are now
    skipped cleanly.

    :param page: 1-based page number to crawl
    """
    url = 'https://beijing.anjuke.com/sale/p{num}/#filtersort'.format(num=page)
    response = get_response(url)
    time.sleep(random.randint(5, 20))  # random delay to avoid being rate-limited
    if response is None:
        return  # request failed — nothing to parse on this page
    one_page = get_one_page(response)
    for record in get_content(str(one_page)):
        write_to_DB(*record)  # record is the 8-tuple yielded by get_content
        print('正在爬取: %s' % record[0])
if __name__ == "__main__":
'''
开启5个进程加快信息爬取
'''
start_time = time.time()
pool = Pool(processes=5)
page = [x+1 for x in range(5)]
pool.map(main, page)
pool.close() # 关闭进程池,表示不能在往进程池中添加进程
pool.join() # 等待进程池中的所有进程执行完毕,必须在close()之后调用
end_time = time.time()
print("所用时间: %d 秒" % (end_time - start_time))
print("爬取结束!")