多线程爬取安客居

import requests
from lxml import etree
import  threading
from  queue  import  Queue
import time
import re
import pymysql


def is_none(message_list):
    """Return the first item of ``message_list``, or the placeholder ``'无'``.

    Any falsy argument (empty list, ``None``, empty string) yields the
    placeholder; otherwise element ``0`` is returned unchanged.
    """
    if not message_list:
        return '无'
    return message_list[0]
class lianjia(threading.Thread):
    """Worker thread that scrapes the ``/loupan`` listings of queued cities.

    Each worker repeatedly takes a city link element from the module-level
    queue ``q``, walks every listing page of that city, opens each listing's
    detail page and inserts one row per listing into the ``lianjia`` table.

    The class depends on two module-level names created by the script entry
    point: ``q`` (queue of city elements) and ``lock`` (held around each
    database insert).
    """

    # Browser-like User-Agent sent with every HTTP request.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    # Seconds to wait for a response so a stalled server cannot hang a worker.
    # NOTE(review): 30 is a chosen value, not taken from the original code.
    _TIMEOUT = 30

    # Placeholder stored whenever a field is absent (same value is_none() uses).
    _MISSING = '无'

    # Matches one run of decimal digits.
    _DIGITS = re.compile(r'(\d+)')

    # Detail-page label text -> table column, listed in the column order of
    # the insert statement below.
    _DETAIL_LABELS = {
        "最新开盘:": 'the_latest_opening',
        "物业类型:": 'property_type',
        "交房时间:": 'handing_room_time',
        "容积率:": 'plot_ratio',
        "产权年限:": 'property_right_year',
        "绿化率:": 'greening_rate',
        "规划户数:": 'planning_num',
        "物业费用:": 'property_cost',
        "车位情况:": 'parking_situation',
        "供暖方式:": 'heating_method',
        "供水方式:": 'water_supply_method',
        "供电方式:": 'electricity_supply_method',
        "建筑类型:": 'building_type',
        "嫌恶设施:": 'disgust_facilities',
        "占地面积:": 'cover_area',
        "建筑面积:": 'building_area2',
    }

    # Parameterised insert: 29 columns, 29 placeholders.
    _INSERT_SQL = (
        "insert into lianjia(image,building_name,average_price,building_area,district,bussiness_district,"
        "house_type,house_orientation,user_comment,project_address,sales_offices_address,"
        "property_developer,property_management_company,the_latest_opening,property_type,"
        "handing_room_time,plot_ratio,property_right_year,greening_rate,planning_num,property_cost,"
        "parking_situation,heating_method,water_supply_method,electricity_supply_method,building_type,"
        "disgust_facilities,cover_area,building_area2) VALUES ("
        "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
        "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
        "%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    def __init__(self):
        super().__init__()  # plain Thread initialisation; no extra state

    def run(self):
        """Process cities from the global queue ``q`` until it is empty.

        A failure while scraping one city is reported and the worker moves on
        to the next city instead of dying.
        """
        from queue import Empty

        while True:
            # get_nowait() replaces the empty()-then-get() pair: another worker
            # could take the last item between the two calls, leaving this
            # thread blocked in get() forever.
            try:
                city = q.get_nowait()
            except Empty:
                break
            try:
                self.shuju(city)
            except Exception as error:
                # Best-effort boundary: report instead of silently swallowing.
                print('city scrape failed:', repr(error))

    def shuju(self, city):
        """Scrape every listing of one city and store each as a database row.

        Args:
            city: element exposing ``.xpath``; its text is used as the city
                name and its ``href`` (prefixed with ``'https:'``) as the
                city base URL.

        Raises:
            Exception: errors from the city/listing pages themselves
                propagate to the caller; a listing without a detail link or
                whose detail page cannot be fetched is skipped instead.
        """
        city_name = city.xpath('./text()')[0]
        city_url = 'https:' + city.xpath('./@href')[0]
        print(city_name, city_url)

        first_page = self._fetch_tree(city_url + '/loupan')
        totals = first_page.xpath('//div[@class="page-box"]/@data-total-count')
        if not totals:
            print('no listing count found, skipping city:', city_url)
            return
        page_num = self._page_count(int(totals[0]))

        for page_no in range(1, page_num + 1):
            url = city_url + '/loupan/pg' + str(page_no) + '/'
            print(url)
            listing_tree = self._fetch_tree(url)
            for message in listing_tree.xpath('//ul[@class="resblock-list-wrapper"]/li'):
                hrefs = message.xpath(
                    './div[@class="resblock-desc-wrapper"]/div[@class="resblock-name"]/a/@href')
                if not hrefs:
                    # Without a detail link there is nothing to fetch; skip
                    # this entry rather than abort the whole city.
                    print('listing without detail link skipped on', url)
                    continue
                building_url = city_url + hrefs[0]
                try:
                    detail_tree = self._fetch_tree(building_url)
                except requests.RequestException as error:
                    print('detail page failed, listing skipped:', building_url, repr(error))
                    continue

                values = self._listing_values(message) + self._detail_page_values(detail_tree)
                # Every value is stored as text, lists included.
                row = tuple(str(value) for value in values)
                time.sleep(0.2)
                self._insert_row(row)
                print(building_url)

    def _fetch_tree(self, url):
        """Download ``url`` and return it parsed by ``etree.HTML``."""
        html = requests.get(url=url, headers=self._HEADERS, timeout=self._TIMEOUT).text
        return etree.HTML(html)

    @staticmethod
    def _page_count(total, page_size=10):
        """Return how many pages of ``page_size`` items hold ``total`` items."""
        # Integer ceiling division; avoids the float comparison of the
        # original `page / 10 == page // 10` test.
        return -(-total // page_size)

    def _listing_values(self, message):
        """Return the six values read from one entry of a listing page.

        Order: image, building_name, average_price, building_area, district,
        bussiness_district.
        """
        desc = './div[@class="resblock-desc-wrapper"]'
        image = is_none(message.xpath('./a/img/@data-original'))
        building_name = is_none(message.xpath(desc + '/div[@class="resblock-name"]/a/text()'))
        price_number = is_none(message.xpath(
            desc + '/div[@class="resblock-price"]/div[@class="main-price"]/span[@class="number"]/text()'))
        price_desc = is_none(message.xpath(
            desc + '/div[@class="resblock-price"]/div[@class="main-price"]/span[@class="desc"]/text()'))
        average_price = price_number + price_desc.strip()
        building_area = is_none(message.xpath(desc + '/div[@class="resblock-area"]/span/text()'))
        district = is_none(message.xpath(desc + '/div[@class="resblock-location"]/span[1]/text()'))
        bussiness_district = is_none(message.xpath(desc + '/div[@class="resblock-location"]/span[2]/text()'))
        return [image, building_name, average_price, building_area, district, bussiness_district]

    def _detail_page_values(self, tree):
        """Return the 23 values read from a listing's detail page.

        Order: house_type, house_orientation, user_comment, project_address,
        sales_offices_address, property_developer,
        property_management_company, then the 16 labelled fields in the
        order of ``_DETAIL_LABELS``.

        Floor-plan image URLs are not collected because the insert statement
        has no column for them.
        """
        type_items = tree.xpath(
            '//div[@class="mod-wrap"]/div/div[@data-index="0"]/div[@class="houselist"]/ul/li[@class="info-li"]')
        house_type = [is_none(item.xpath('./p[@class="p1"]/text()'))
                      for item in type_items] or self._MISSING
        house_orientation = [
            is_none(item.xpath('./p[@class="p1"]/span[@class="p1-orientation "]/text()'))
            for item in type_items] or self._MISSING

        # Only the first comment is stored. The original indexed [0] *after*
        # str(), which kept just the first character of the list's repr.
        comment_items = tree.xpath('//div[@class="list_box"]/ul[@class="list"]/li')
        if comment_items:
            user_comment = is_none(comment_items[0].xpath(
                './div[@class="r_comment"]/div[@class="words-container"]/div[@class="words"]/text()'))
        else:
            user_comment = self._MISSING

        box = '//div[@class="mod-wrap"]/div/div[@class="box-loupan"]'
        project_address = is_none(tree.xpath(box + '/p[2]/span[@class="label-val"]/text()'))
        sales_offices_address = is_none(tree.xpath(box + '/p[3]/span[@class="label-val"]/text()'))
        property_developer = is_none(tree.xpath(box + '/p[4]/span[@class="label-val"]/text()'))
        property_management_company = is_none(tree.xpath(box + '/p[5]/span[@class="label-val"]/text()'))

        pairs = []
        for item in tree.xpath(box + '/ul//li'):
            labels = item.xpath('./p/span[@class="label"]/text()')
            if len(labels) != 1:
                # The original matched only a single label text node.
                continue
            pairs.append((labels[0], is_none(item.xpath('./p/span[@class="label-val"]/text()'))))
        details = self._detail_fields(pairs)

        return [house_type, house_orientation, user_comment, project_address,
                sales_offices_address, property_developer,
                property_management_company] + list(details.values())

    @classmethod
    def _detail_fields(cls, pairs):
        """Map ``(label, raw_value)`` string pairs to a column -> value dict.

        Every column of ``_DETAIL_LABELS`` is present in the result, in that
        order, and defaults to ``'无'``. Keying by label (instead of by the
        position at which a label happened to appear on the page) keeps
        values in their own columns when a label is missing or reordered.
        Unknown labels are ignored; a repeated label keeps its last value.
        """
        fields = dict.fromkeys(cls._DETAIL_LABELS.values(), cls._MISSING)
        for label, raw in pairs:
            column = cls._DETAIL_LABELS.get(label)
            if column is None:
                continue
            if column == 'property_cost':
                # Keep the first run of digits; fall back to the full text.
                text = raw.strip()
                runs = cls._DIGITS.findall(text)
                value = runs[0] if runs else text
            elif column == 'parking_situation':
                # Sum of every number found once spaces are removed.
                runs = cls._DIGITS.findall(raw.replace(' ', ''))
                value = str(sum(int(run) for run in runs))
            elif column in ('cover_area', 'building_area2'):
                # Drop the trailing unit character, but leave the placeholder
                # intact rather than trimming it to an empty string.
                text = raw.strip()
                value = text if text == cls._MISSING else text[:-1]
            else:
                value = raw.strip()
            fields[column] = value
        return fields

    def _insert_row(self, row):
        """Insert one 29-value ``row`` into the ``lianjia`` table.

        A failed insert is reported and swallowed so the crawl continues; the
        connection is always closed afterwards.
        """
        connect = pymysql.connect(
            host='localhost',
            db='renting',
            user='root',
            password='root'
        )
        try:
            cursor = connect.cursor()
            try:
                # The global lock serialises inserts across worker threads,
                # as in the original code. The one-second pause stays inside
                # the lock. NOTE(review): presumably intended as throttling.
                with lock:
                    try:
                        cursor.execute(self._INSERT_SQL, row)
                        connect.commit()
                        time.sleep(1)
                        print('数据插入成功')
                    except Exception as error:
                        print(*row)
                        print("插入数据失败")
                        print(repr(error))
            finally:
                cursor.close()
        finally:
            # The original opened one connection per listing and never
            # closed it.
            connect.close()
if __name__ == '__main__':
    # Module-level names read by the worker threads: `lock` is held around
    # each database insert and `q` holds the city link elements to process.
    lock = threading.Lock()
    q = Queue()

    # The city index is read from a local copy of the page, not fetched.
    with open('index.html', 'r', encoding='utf-8') as index_file:
        index_tree = etree.HTML(index_file.read())

    for city_link in index_tree.xpath('//div[@class="fc-main clear"]//li/div/a'):
        q.put(city_link)

    # Start nine workers; each drains the queue until it is empty.
    for _ in range(9):
        lianjia().start()



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值