A Scraper for a Housing-Listings Site

import random
import re
import requests
import time
from lxml import etree
from pymongo import MongoClient
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_response(url):
    '''
        Given a URL, fetch the page and return its HTML text.
    '''
    # Build the request headers; the User-Agent and Cookie values below are
    # truncated placeholders -- substitute a full UA string and a valid cookie.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Cookie': '5Od66EiO2sfsd',
    }
    try:
        response = requests.get(url, headers=headers)
        return response.text
    except RequestException:
        return None
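# A simple retry wrapper around get_response (a sketch, not part of the
# original scraper; the retry count and delay are arbitrary assumptions):
def get_response_with_retry(url, retries=3, delay=5):
    for attempt in range(retries):
        response = get_response(url)
        if response is not None:
            return response
        time.sleep(delay)  # pause briefly before the next attempt
    return None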

def get_one_page(response):
    '''
        Use a regular expression to cut the listings block out of the
        full page source.
    '''
    # re.S lets '.' match newlines, so the non-greedy group can span the block.
    pattern = re.compile(r'class="houselist-mod houselist-mod-new"(.*?)<div id="IFX_p937".*?>', re.S)
    item = pattern.findall(response)
    return item
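# A quick illustration of the extraction (the sample HTML below is made up):
# the group captures everything between the listings-container class and the
# "IFX_p937" div that follows it.
# >>> sample = 'class="houselist-mod houselist-mod-new"<li>house</li><div id="IFX_p937">'
# >>> get_one_page(sample)
# ['<li>house</li>']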

def get_content(item):
    '''
        Parse the listings block with XPath and a regular expression,
        yielding one tuple per listing.
    '''
    selector = etree.HTML(item)  # convert the source into an XPath-queryable tree
    title = selector.xpath('//div[@class="house-details"]/div[@class="house-title"]/a/@title')  # returns a list
    pattern = re.compile(r'<div class="details-item">.*?<span>(.*?)</span>.*?class="spe-lines".*?<span>(.*?)</span>'
                         r'.*?class="spe-lines".*?<span>(.*?)</span>.*?class="spe-lines".*?<span>(.*?)</span>.*?'
                         r'class="brokername">.*?<div class="pro-price">.*?class="price-det".*?<strong>(.*?)</strong>'
                         r'.*?class="unit-price">(.*?)</span>', re.S)
    item1 = pattern.findall(item)
    address = selector.xpath('//div[@class="details-item"]/span[@class="comm-address"]/@title')
    for i, name, add in zip(item1, title, address):
        yield name, i[0], i[1], i[2], i[3], i[4] + '万', i[5], add
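# The regex above is brittle against markup changes; below is a pure-XPath
# variant covering a few of the fields (a sketch -- only the class names are
# taken from the code above, the rest of the page layout is assumed):
def get_content_xpath(item):
    selector = etree.HTML(item)
    titles = selector.xpath('//div[@class="house-details"]/div[@class="house-title"]/a/@title')
    prices = selector.xpath('//span[@class="price-det"]/strong/text()')
    unit_prices = selector.xpath('//span[@class="unit-price"]/text()')
    addresses = selector.xpath('//div[@class="details-item"]/span[@class="comm-address"]/@title')
    for name, price, u_price, add in zip(titles, prices, unit_prices, addresses):
        yield name, price + '万', u_price, add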


def write_to_DB(title, hx, size, lc, build, price, u_price, address):
    '''
        Save one record to the MongoDB database.
    '''
    client = MongoClient('localhost', 27017)  # connect to the database
    db = client['House']
    # Collection.save() is deprecated and removed in PyMongo 4; use insert_one.
    db.House.insert_one({"title": title, "layout": hx, "size": size, "floor": lc,
                         "year": build, "price": price, "unit_price": u_price, "address": address})

def main(page):
    url = 'https://beijing.anjuke.com/sale/p{num}/#filtersort'.format(num=page)
    response = get_response(url)
    if response is None:  # the request failed; skip this page
        return
    time.sleep(random.randint(5, 20))  # random pause to throttle requests
    one_page = get_one_page(response)
    # join the captured block(s) into one string instead of str(list),
    # which would escape newlines and add list brackets
    content = get_content(''.join(one_page))
    for i in content:
        write_to_DB(i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7])
        print('Crawling: %s' % i[0])


if __name__ == "__main__":
    # Spawn 5 processes to speed up the crawl.
    start_time = time.time()
    pool = Pool(processes=5)
    pages = [x + 1 for x in range(5)]
    pool.map(main, pages)
    pool.close()            # close the pool: no new tasks can be submitted
    pool.join()             # wait for all workers to finish; must be called after close()
    end_time = time.time()
    print("Elapsed time: %d seconds" % (end_time - start_time))
    print("Crawl finished!")

 
