import requests
from lxml import etree
import threading
from queue import Queue
import time
import re
import pymysql
def is_none(message_list):
    """Return the first item of ``message_list``, or ``'无'`` when there is none.

    Args:
        message_list: a sequence such as the list returned by an XPath
            query; any falsy value (empty list, ``None``) counts as
            "no result".

    Returns:
        ``message_list[0]`` when the sequence is non-empty, otherwise the
        placeholder string ``'无'``.
    """
    if message_list:
        return message_list[0]
    # Placeholder meaning "nothing found"; returned directly so that no
    # local variable shadows the builtin ``str``.
    return '无'
class lianjia(threading.Thread):
    """Worker thread that scrapes new-home listings and stores them in MySQL.

    Each worker repeatedly takes a city link element from the module-level
    queue ``q`` and scrapes every listing page of that city.  One row per
    listing is inserted into the ``lianjia`` table while holding the
    module-level ``lock``.  Both ``q`` and ``lock`` must exist as module
    globals before a worker is started.

    NOTE(review): the database host, schema and credentials are hard-coded
    in ``_insert_row``; consider moving them to configuration.
    """

    # Placeholder stored for every value that could not be scraped
    # (the same value ``is_none`` returns for an empty result).
    MISSING = '无'

    # Browser-like headers sent with every HTTP request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    # Seconds to wait for the server; without a timeout a stalled
    # connection would block the worker forever.
    REQUEST_TIMEOUT = 10

    # Columns of the ``lianjia`` table, in insert order.
    COLUMNS = (
        'image', 'building_name', 'average_price', 'building_area',
        'district', 'bussiness_district', 'house_type', 'house_orientation',
        'user_comment', 'project_address', 'sales_offices_address',
        'property_developer', 'property_management_company',
        'the_latest_opening', 'property_type', 'handing_room_time',
        'plot_ratio', 'property_right_year', 'greening_rate', 'planning_num',
        'property_cost', 'parking_situation', 'heating_method',
        'water_supply_method', 'electricity_supply_method', 'building_type',
        'disgust_facilities', 'cover_area', 'building_area2',
    )

    # Built from COLUMNS so the placeholder count can never drift from the
    # column list.
    INSERT_SQL = ('insert into lianjia(' + ','.join(COLUMNS) + ') VALUES ('
                  + ','.join(['%s'] * len(COLUMNS)) + ')')

    # Label text on the detail page -> column that stores its value.
    DETAIL_LABELS = {
        '最新开盘:': 'the_latest_opening',
        '物业类型:': 'property_type',
        '交房时间:': 'handing_room_time',
        '容积率:': 'plot_ratio',
        '产权年限:': 'property_right_year',
        '绿化率:': 'greening_rate',
        '规划户数:': 'planning_num',
        '物业费用:': 'property_cost',
        '车位情况:': 'parking_situation',
        '供暖方式:': 'heating_method',
        '供水方式:': 'water_supply_method',
        '供电方式:': 'electricity_supply_method',
        '建筑类型:': 'building_type',
        '嫌恶设施:': 'disgust_facilities',
        '占地面积:': 'cover_area',
        '建筑面积:': 'building_area2',
    }

    _DIGITS = re.compile(r'(\d+)')

    def __init__(self):
        super().__init__()  # initialise the Thread base class

    def run(self):
        """Take cities from the shared queue ``q`` until it is empty."""
        # Local import: only ``Queue`` is imported at file level.
        from queue import Empty

        while True:
            try:
                # Non-blocking get: a separate empty()/get() pair could block
                # forever if another worker takes the last item in between.
                city = q.get_nowait()
            except Empty:
                break
            try:
                self.shuju(city)
            except Exception as exc:
                # Best effort: a failing city must not stop the worker, but
                # the failure is reported instead of being silently dropped.
                print('抓取城市失败:', repr(exc))

    def shuju(self, city):
        """Scrape every new-home listing of one city and store each as a row.

        Args:
            city: link element of a city; its text is printed as the city
                name and its ``href`` attribute, prefixed with ``https:``,
                is used as the base URL of the city site.
        """
        city_name = city.xpath('./text()')[0]
        city_url = 'https:' + city.xpath('./@href')[0]
        print(city_name, city_url)

        url = city_url + '/loupan'
        total = self._fetch_tree(url).xpath(
            '//div[@class="page-box"]/@data-total-count')
        if not total:
            # No listing counter on the page: nothing to paginate over.
            print('未找到楼盘总数:', url)
            return

        for page_no in range(1, self._page_count(int(total[0])) + 1):
            url = city_url + '/loupan/pg' + str(page_no) + '/'
            print(url)
            listings = self._fetch_tree(url).xpath(
                '//ul[@class="resblock-list-wrapper"]/li')
            for message in listings:
                row, building_url = self._scrape_listing(message, city_url)
                if row is None:
                    continue  # listing without a detail link
                time.sleep(0.2)
                self._insert_row(row, building_url)

    @staticmethod
    def _page_count(total, per_page=10):
        """Return how many pages are needed to show ``total`` listings."""
        return -(-total // per_page)  # ceiling division

    @classmethod
    def _clean_detail(cls, column, raw):
        """Normalise the raw text of one labelled detail field for ``column``."""
        if column == 'parking_situation':
            # Spaces are removed first, then every run of digits is summed.
            numbers = cls._DIGITS.findall(raw.replace(' ', ''))
            return str(sum(int(number) for number in numbers))
        value = raw.strip()
        if column == 'property_cost':
            # Keep only the first run of digits; fall back to the text when
            # it contains no digits at all.
            found = cls._DIGITS.search(value)
            return found.group(1) if found else value
        if column in ('cover_area', 'building_area2'):
            # Drop the trailing character (presumably the unit symbol), but
            # leave the "missing" placeholder intact.
            return value if value == cls.MISSING else value[:-1]
        return value

    def _fetch_tree(self, url):
        """Download ``url`` and return the parsed HTML tree."""
        html = requests.get(url=url, headers=self.HEADERS,
                            timeout=self.REQUEST_TIMEOUT).text
        return etree.HTML(html)

    def _scrape_listing(self, message, city_url):
        """Collect the row for one listing element.

        Returns:
            ``(row, building_url)`` where ``row`` is a tuple of strings in
            ``COLUMNS`` order, or ``(None, None)`` when the listing has no
            link to a detail page.
        """
        desc = './div[@class="resblock-desc-wrapper"]'
        hrefs = message.xpath(desc + '/div[@class="resblock-name"]/a/@href')
        if not hrefs:
            return None, None
        building_url = city_url + hrefs[0]

        price_xpath = desc + '/div[@class="resblock-price"]/div[@class="main-price"]'
        price_number = is_none(
            message.xpath(price_xpath + '/span[@class="number"]/text()'))
        price_desc = is_none(
            message.xpath(price_xpath + '/span[@class="desc"]/text()'))
        location_xpath = desc + '/div[@class="resblock-location"]'

        values = {
            'image': is_none(message.xpath('./a/img/@data-original')),
            'building_name': is_none(
                message.xpath(desc + '/div[@class="resblock-name"]/a/text()')),
            'average_price': price_number + price_desc.strip(),
            'building_area': is_none(
                message.xpath(desc + '/div[@class="resblock-area"]/span/text()')),
            'district': is_none(
                message.xpath(location_xpath + '/span[1]/text()')),
            'bussiness_district': is_none(
                message.xpath(location_xpath + '/span[2]/text()')),
        }
        values.update(self._scrape_detail(self._fetch_tree(building_url)))
        # Everything is stored as text; list values become their repr.
        row = tuple(str(values[column]) for column in self.COLUMNS)
        return row, building_url

    def _scrape_detail(self, tree):
        """Collect the detail-page values as a ``{column: value}`` dict."""
        house_items = tree.xpath(
            '//div[@class="mod-wrap"]/div/div[@data-index="0"]/div[@class="houselist"]/ul/li[@class="info-li"]')
        house_type = [
            is_none(item.xpath('./p[@class="p1"]/text()'))
            for item in house_items]
        house_orientation = [
            is_none(item.xpath('./p[@class="p1"]/span[@class="p1-orientation "]/text()'))
            for item in house_items]
        comments = [
            is_none(item.xpath(
                './div[@class="r_comment"]/div[@class="words-container"]/div[@class="words"]/text()'))
            for item in tree.xpath('//div[@class="list_box"]/ul[@class="list"]/li')]

        box = '//div[@class="mod-wrap"]/div/div[@class="box-loupan"]'
        # Every labelled field starts as "missing", so a label absent from
        # the page can never shift values into the wrong column.
        detail = dict.fromkeys(self.DETAIL_LABELS.values(), self.MISSING)
        detail.update({
            'house_type': house_type or self.MISSING,
            'house_orientation': house_orientation or self.MISSING,
            # Only the first comment is stored.
            'user_comment': is_none(comments),
            'project_address': is_none(
                tree.xpath(box + '/p[2]/span[@class="label-val"]/text()')),
            'sales_offices_address': is_none(
                tree.xpath(box + '/p[3]/span[@class="label-val"]/text()')),
            'property_developer': is_none(
                tree.xpath(box + '/p[4]/span[@class="label-val"]/text()')),
            'property_management_company': is_none(
                tree.xpath(box + '/p[5]/span[@class="label-val"]/text()')),
        })

        for item in tree.xpath(box + '/ul//li'):
            labels = item.xpath('./p/span[@class="label"]/text()')
            if len(labels) != 1:
                continue
            column = self.DETAIL_LABELS.get(labels[0])
            if column is None:
                continue  # a label this scraper does not store
            raw = is_none(item.xpath('./p/span[@class="label-val"]/text()'))
            detail[column] = self._clean_detail(column, raw)
        return detail

    def _insert_row(self, row, building_url):
        """Insert one row into the ``lianjia`` table, reporting any failure."""
        connect = pymysql.connect(
            host='localhost',
            db='renting',
            user='root',
            password='root'
        )
        try:
            cursor = connect.cursor()
            with lock:  # inserts are serialised across worker threads
                try:
                    cursor.execute(self.INSERT_SQL, row)
                    connect.commit()
                    # NOTE(review): sleeping while holding the lock makes
                    # every other worker wait as well; kept so the overall
                    # crawl rate is unchanged.
                    time.sleep(1)
                    print('数据插入成功')
                except Exception as exc:
                    print(*row)
                    print("插入数据失败", repr(exc))
                    print(building_url)
        finally:
            # Always release the connection, even when the insert failed.
            connect.close()
if __name__ == '__main__':
    # Shared state that the worker threads read as module-level globals.
    lock = threading.Lock()
    q = Queue()

    # The city links are taken from a locally saved copy of the index page.
    with open('index.html', 'r', encoding='utf-8') as index_file:
        index_tree = etree.HTML(index_file.read())

    # Queue one link element per city.
    for city_link in index_tree.xpath('//div[@class="fc-main clear"]//li/div/a'):
        q.put(city_link)

    # Start nine worker threads that drain the queue.
    for _ in range(9):
        lianjia().start()
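# Source article title: "多线程爬取安客居" (multi-threaded scraping of Anjuke [sic]).
# Web-page residue: "最新推荐文章于 2020-09-07 23:59:40 发布" (latest recommended article published 2020-09-07 23:59:40).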