Python: Scraping Zhengzhou Second-Hand Home Listings from Anjuke

Python version: 3.7

Description: scrape second-hand home sale listings for the districts of Zhengzhou from Anjuke. The only third-party dependencies are urllib3 and beautifulsoup4 (bs4); sqlite3, random, and threading ship with the standard library.

Code:

# -*- coding: utf-8 -*-
"""
@site: http://www.wangxiaofeng.site
"""
import random
import sqlite3
import threading

import urllib3
from bs4 import BeautifulSoup

urllib3.disable_warnings()  # silence urllib3 warnings (e.g. insecure-request warnings)

# Some User Agents
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]


lock = threading.Lock()


class SQLiteWraper(object):
    """
    A thin wrapper around sqlite3 that serializes writes from multiple threads.
    """

    def __init__(self, path, command='', *args, **kwargs):
        self.lock = threading.RLock()  # re-entrant lock serializing all DB access
        self.path = path  # path to the SQLite database file

        if command != '':
            conn = self.get_conn()
            cu = conn.cursor()
            cu.execute(command)

    def get_conn(self):
        conn = sqlite3.connect(self.path)  # a fresh connection per call, created and used in the same thread
        conn.text_factory = str
        return conn

    def conn_close(self, conn=None):
        conn.close()

    def conn_trans(func):
        def connection(self, *args, **kwargs):
            self.lock.acquire()
            conn = self.get_conn()
            kwargs['conn'] = conn
            rs = func(self, *args, **kwargs)
            self.conn_close(conn)
            self.lock.release()
            return rs

        return connection
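    # conn_trans is used as a decorator below: each wrapped call acquires the
    # lock, opens a fresh connection, injects it as the 'conn' keyword
    # argument, and closes it afterwards, so concurrent writes are serialized.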

    @conn_trans
    def execute(self, command, method_flag=0, conn=None):
        cu = conn.cursor()
        try:
            if not method_flag:
                cu.execute(command)
            else:
                cu.execute(command[0], command[1])
            conn.commit()
        except sqlite3.IntegrityError:
            return -1  # duplicate row
        except Exception as e:
            print(e)
            return -2
        return 0
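
# Usage sketch for the wrapper (the 'demo' table here is hypothetical):
#   db = SQLiteWraper('demo.db', 'create table if not exists demo (k TEXT primary key)')
#   db.execute(("insert into demo values(?)", ('key1',)), 1)  # returns 0 on success
#   db.execute(("insert into demo values(?)", ('key1',)), 1)  # returns -1, duplicate key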


def gen_ershoufang_insert_command(info_dict):
    """
    生成小区数据库插入命令
    """
    info_list = [u'小区名称', u'房屋户型', u'单价', u'位置', u'面积', u'首付', u'年代', u'朝向', u'月供', u'房屋类型', u'楼层', u'装修程度', u'产权年限',
                 u'电梯', u'房本年限', u'产权性质', u'唯一住房']
    t = []
    for il in info_list:
        if il in info_dict:
            t.append(info_dict[il])
        else:
            t.append('')
    t = tuple(t)
    commands = (r"insert into anjuhouse values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", t)
    # commands = (r"insert into anjuhouse values(?,?)", t)
    return commands
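
# For example, {u'小区名称': u'XX小区'} (a hypothetical value) yields a (sql, params)
# pair: an INSERT with 17 placeholders plus a 17-tuple padded with empty strings
# for missing fields, which SQLiteWraper.execute runs with method_flag=1.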


def ershoufang_spider(db_ershoufang, url_page):
    """
    Scrape the attributes of one listing detail page and store them in the DB.
    """
    try:
        http = urllib3.PoolManager()
        req = http.request('GET', url_page, headers=random.choice(hds))
        plain_text = req.data.decode('utf-8')
        soup = BeautifulSoup(plain_text, "html.parser")
        # Every attribute on the detail page sits in a div.houseInfo-content node
        cj_list = soup.findAll('div', {'class': 'houseInfo-content'})
        # The 17 attributes appear in a fixed order matching the DB columns
        field_names = [u'小区名称', u'房屋户型', u'单价', u'位置', u'面积', u'首付', u'年代',
                       u'朝向', u'月供', u'房屋类型', u'楼层', u'装修程度', u'产权年限',
                       u'电梯', u'房本年限', u'产权性质', u'唯一住房']
        info_dict = {}
        for name, node in zip(field_names, cj_list):
            # Collapse all embedded whitespace in the extracted text
            info_dict[name] = node.get_text().replace(" ", "").replace("\t", "").replace("\n", "").strip()
        commands = gen_ershoufang_insert_command(info_dict)
        db_ershoufang.execute(commands, 1)
    except (urllib3.exceptions.HTTPError, urllib3.exceptions.NewConnectionError) as e:
        print(e)
        exit(-1)
    except Exception as e:
        print(e)
        exit(-2)

def db_ershoufang_spider(db_ershoufang, page=1):
    """
    Scrape one listing-index page and spawn a worker thread per listing URL.
    """
    url = u"https://zhengzhou.anjuke.com/sale/p%d/" % page
    try:
        http = urllib3.PoolManager()
        req = http.request('GET', url, headers=random.choice(hds))
        plain_text = req.data.decode('utf-8')
        soup = BeautifulSoup(plain_text, "html.parser")
        # Collect the detail-page URL of every listing on this index page
        url_list = [a.get('href') for a in soup.findAll('a', {'class': 'houseListTitle'})]
        print(url_list)
        threads = []
        for viewurl in url_list:
            t = threading.Thread(target=ershoufang_spider, args=(db_ershoufang, viewurl))
            threads.append(t)
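        # Start all detail-page workers, then wait for them all to finish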
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    except (urllib3.exceptions.HTTPError, urllib3.exceptions.NewConnectionError) as e:
        print(e)
        exit(-3)
    except Exception as e:
        print(e)
        exit(-4)

if __name__ == "__main__":
    command = "create table if not exists anjuhouse (xiaoqu TEXT, huxing TEXT, danjia TEXT, weizhi TEXT, mianji TEXT, shoufu TEXT, niandai TEXT, chaoxiang TEXT, yuegong TEXT, leixing TEXT, louceng TEXT, zhuangxiu TEXT, chanquan TEXT, dianti TEXT, nianxian TEXT, xingzhi TEXT, weiyi TEXT)"
    # command = "create table if not exists anjuhouse (xiaoqu TEXT, huxing TEXT)"
    db_ershoufang = SQLiteWraper('anjuke-ershoufang.db', command)

    # Scrape index pages 2 through 50
    for page in range(2, 51):
        db_ershoufang_spider(db_ershoufang, page)
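
After a run, the collected rows can be checked directly against the SQLite file; a minimal sketch, using only the database file, table, and column names created above:

import sqlite3

conn = sqlite3.connect('anjuke-ershoufang.db')
# Count the collected listings and print a small sample
print(conn.execute("select count(*) from anjuhouse").fetchone()[0])
for xiaoqu, danjia, mianji in conn.execute("select xiaoqu, danjia, mianji from anjuhouse limit 5"):
    print(xiaoqu, danjia, mianji)
conn.close()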

Results

Copyright notice

The content and code in this article are for personal study only; do not use them for commercial purposes. The author accepts no liability for losses caused to others, or for any legal disputes that arise, from improper personal use.
