import requests
from lxml import etree
import threading
from queue import Queue
import time
import re
import pymysql
def is_none(message_list, default='无'):
    """Return the first item of ``message_list``, or ``default`` if it is empty.

    Args:
        message_list: A sequence such as the list returned by an
            ``xpath()`` query. ``None`` and empty sequences are treated
            alike.
        default: Placeholder returned when there is nothing to pick.
            Defaults to ``'无'`` (Chinese for "none").

    Returns:
        ``message_list[0]`` when ``message_list`` is truthy, otherwise
        ``default``.
    """
    # No local named ``str`` here: that would shadow the builtin.
    if message_list:
        return message_list[0]
    return default
class lianjia(threading.Thread):
def __init__(self):
super().__init__()#继承父类
def run(self):
    """Drain the shared queue ``q``, scraping one city per item.

    Loops until the queue has no more items, handing each item to
    ``self.shuju``. A failure while scraping one city is reported and
    does not stop the worker.

    NOTE(review): ``q`` is a module-level name defined outside this
    method -- presumably a ``queue.Queue`` of city link elements; confirm.
    """
    # Local import: the file's top-level imports only bring in ``Queue``.
    from queue import Empty

    while True:
        # Checking ``empty()`` and then calling a blocking ``get()`` is racy
        # when several workers share the queue: another thread can take the
        # last item in between, leaving this one blocked forever. A
        # non-blocking get makes the check and the fetch a single step.
        try:
            city = q.get_nowait()
        except Empty:
            break
        try:
            self.shuju(city)
        except Exception as exc:
            # Best effort: carry on with the next city, but report what
            # failed instead of silently swallowing the error.
            print('failed to scrape city:', repr(exc))
def shuju(self,city):
city_name = city.xpath('./text()')[0]
city_url = 'https:' + city.xpath('./@href')[0]
print(city_name, city_url)
url = city_url + '/loupan'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
response1 = requests.get(url=url, headers=headers).text
tree = etree.HTML(response1)
page = tree.xpath('//div[@class="page-box"]/@data-total-count')[0]
page = int(page)
if page / 10 == page // 10:
page_num = page // 10
else:
page_num = page // 10 + 1
for i in range(1, page_num + 1):
url = city_url + '/loupan/pg' + str(i) + '/'
print(url)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
listpage_message = tree.xpath('//ul[@class="resblock-list-wrapper"]/li')
for message in listpage_message:
all_message_list = []
image = message.xpath('./a/img/@data-original')
image = is_none(image)
all_message_list.append(image)
building_name = message.xpath(
'./div[@class="resblock-desc-wrapper"]/div[@class="resblock-name"]/a/text()')
building_name = is_none(building_name)
all_message_list.append(building_name)
average_price1 = message.xpath(
'./div[@class="resblock-desc-wrapper"]/div[@class="resblock-price"]/div[@class="main-price"]/span[@class="number"]/text()')
average_price1 = is_none(average_price1)
average_price2 = message.xpath(
'./div[@class="resblock-desc-wrapper"]/div[@class="resblock-price"]/div[@class="main-price"]/span[@class="desc"]/text()')
average_price2 = is_none(average_price2)
average_price = average_price1 + average_price2.strip()
all_message_list.append(average_price)
building_area = message.xpath(
'./div[@class="resblock-desc-wrapper"]/div[@class="resblock-area"]/span/text()')
building_area = is_none(building_area)
all_message_list.append(building_area)
district = message.xpath(
'./div[@class="resblock-desc-wrapper"]/div[@class="resblock-location"]/span[1]/text()')
district = is_none(district)
all_message_list.append(district)
bussiness_district = message.xpath(
'./div[@class="resblock-desc-wrapper"]/div[@class="resblock-location"]/span[2]/text()')
bussiness_district = is_none(bussiness_district)
all_message_list.append(bussiness_district)
building_url = city_url + message.xpath(
'./div[@class="resblock-desc-wrapper"]/div[@class="resblock-name"]/a/
多线程爬取安客居
最新推荐文章于 2020-09-07 23:59:40 发布