# python 3.x
# encoding: utf-8
# Author: jzp
# date: 2021.08.09
import pandas as pd
import requests
from lxml import etree
import numpy as np
import time
import threading
from datetime import datetime
import queue as Queue
def get_response(url):
    """Fetch one sold-listing detail page and return its decoded HTML text.

    Tries a random proxy from the IP pool first (see get_ip()); on any
    proxy failure it falls back to a direct request.

    :param url: detail-page URL of a sold second-hand listing
    :return: page body as a str (utf-8 decoded)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
        'cookie': 'lianjia_uuid=d97adab2-3725-48d7-8bb9-bc7c2307f085; _smt_uid=60a25212.2c0d0578; UM_distinctid=1797a1097846c0-05531f4bb4eff2-2363163-1e6000-1797a109785349; _ga=GA1.2.1379346907.1621250581; _jzqy=1.1621250579.1628512694.4.jzqsr=baidu.jzqsr=baidu; _jzqx=1.1628596538.1628777187.1.jzqsr=su%2Elianjia%2Ecom|jzqct=/chengjiao/.-; select_city=320500; lianjia_ssid=373b966b-7e27-44e0-bd75-4746b6fc150f; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1628512698,1628593574,1628768037,1629004394; CNZZDATA1254525908=1947407041-1621245490-null%7C1629001874; CNZZDATA1254525948=1979302765-1621247107-null%7C1628999326; CNZZDATA1255633284=328011018-1621247107-null%7C1629001344; CNZZDATA1255604082=1744875140-1621247107-null%7C1629001432; _qzjc=1; _jzqa=1.3950590266541956600.1621250579.1628777187.1629004395.15; _jzqc=1; _jzqckmp=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221797a109bf9215-0b527eee217d58-2363163-1990656-1797a109bfaa61%22%2C%22%24device_id%22%3A%221797a109bf9215-0b527eee217d58-2363163-1990656-1797a109bfaa61%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wysuzhou%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; _gid=GA1.2.1097917226.1629004396; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1629004435; _qzja=1.1303379099.1621250579127.1628777187219.1629004394594.1629004394594.1629004434877.0.0.0.103.15; _qzjb=1.1629004394594.2.0.0.0; _qzjto=2.1.0; _jzqb=1.2.10.1629004395.1; '
                  'srcid=eyJ0Ijoie1wiZGF0YVwiOlwiODY0NWFhMDUzYzAwMWEyNWM4NGQzZDNmZTFiNTMwOGYzMzU2MmZhN2M5YmZiMDMyZDk3MTRmZTUxN2Y5NDU1NmEwZGNiYTEwZWYzZjY1YWI3NWExZjljNGFmYjEzNjI5MDMyZDlkY2U5NmJlYmEyOTZhMTk4NDI5YTZjMTkzN2Y1MmY0ZjE2NWExNjg5NjgzNTJkN2M0NGZmOGJkMTY4N2E2ZGE0OWNkY2ZkMjY0YTFjZjViZmU4MTllOTg5MjA0YjhmZThiNDM0ZTQwMjdjZDM2ZDUwZWNiYWIxOTBkY2QwNTI5M2NiNjA3YjExYjBjMTE1YWNkNTAwODRjMTE4ZDlmNDhjZWE0YzJhZThmM2NhMzBiYzQ3YjhiNDY2OWQzMGUxMTQ2MmZkNGJlOWRhYzQzZWMyZjZhZDI5NjZlMDI0NTkxZGM4NTIyZTc1Mzc1NTFjMTJjOWFlMmI4MGE2M1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCI2N2Q5NmMzNlwifSIsInIiOiJodHRwczovL3N1LmxpYW5qaWEuY29tL2NoZW5namlhby8xMDcxMDMzMzk0NzkuaHRtbCIsIm9zIjoid2ViIiwidiI6IjAuMSJ9',
        'referer': 'https://su.lianjia.com/chengjiao/',
        'Host': 'su.lianjia.com'
    }
    try:
        # Original code referenced an undefined `proxies` name, so the
        # proxy branch always failed; call get_ip() as intended.
        proxies = get_ip()
        response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
    except Exception:
        # Proxy pool missing/unreachable — retry without a proxy.
        response = requests.get(url=url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    return response.text
def get_url():
    """Return the listing-URL column of the sold-listings CSV.

    Reads the module-level path ``file`` (set in the __main__ block);
    the old docstring documented a ``file_path`` parameter that never
    existed.

    :return: list of detail-page URLs (column '房源链接')
    """
    df = pd.read_csv(file)
    return list(df['房源链接'])
# Substitute a blank placeholder when an xpath query matches nothing.
def fillna(html, xpath):
    """Evaluate *xpath* against *html* and return the match list.

    Returns a single-space placeholder list when nothing matches, so the
    per-field result lists stay the same length across listings.
    """
    matched = html.xpath(xpath)
    return matched or [' ']
def get_data(thread_name, q):
    """Pull one URL off the work queue, scrape its detail page, and append
    the extracted fields to the shared result lists.

    :param thread_name: worker label, used only in progress logging
    :param q: shared queue.Queue of listing URLs
    :raises queue.Empty: from q.get() after 2 s on an empty queue — the
        worker threads rely on this exception to terminate their loop,
        so it is deliberately NOT caught here.
    """
    url = q.get(timeout=2)  # timeout signals the end of the queue
    try:
        response = get_response(url)
        html = etree.HTML(response)
        print(q.qsize(), thread_name)
        list_extend(url, html)
        print(f'{url}爬取结束!!')
        print(len(url_list), len(community_url), len(times_of_scan), len(times_of_adprice), len(popular_attention),
              len(room_type), len(room_construct),
              len(buding_type), len(buding_construct), len(pct_houseoflift))
    except Exception as e:
        # Log and continue: one bad page must not kill the worker.
        print(q.qsize(), thread_name, url, 'Error', e)
    time.sleep(1)  # throttle requests to avoid an IP ban
def list_extend(url, html):
    """Append one listing's scraped fields to the shared result lists.

    Holds the module-level lock ``mu`` for the whole batch of appends so
    concurrent workers cannot interleave writes and misalign the columns.

    :param url: the listing URL (row key, kept in url_list)
    :param html: lxml element tree of the detail page
    """
    # `with` guarantees the lock is released even if an xpath call
    # raises; the old acquire()/release() pair would deadlock all
    # workers on the first exception.
    with mu:
        url_list.append(url)
        community_url.extend(fillna(html, "//div[@class='deal-bread']/a[5]/@href"))
        times_of_adprice.extend(fillna(html, "//div[@class='msg']/span[3]/label/text()"))
        popular_attention.extend(fillna(html, "//div[@class='msg']/span[5]/label/text()"))
        times_of_scan.extend(fillna(html, "//div[@class='msg']/span[6]/label/text()"))
        room_type.extend(fillna(html, "//div[@class='base']/div[@class='content']/ul/li[1]/text()"))
        room_construct.extend(fillna(html, "//div[@class='base']/div[@class='content']/ul/li[4]/text()"))
        buding_type.extend(fillna(html, "//div[@class='base']/div[@class='content']/ul/li[6]/text()"))
        buding_construct.extend(fillna(html, "//div[@class='base']/div[@class='content']/ul/li[10]/text()"))
        pct_houseoflift.extend(fillna(html, "//div[@class='base']/div[@class='content']/ul/li[12]/text()"))
def df_concat():
    """Assemble the scraped detail columns into a DataFrame and left-join
    them onto the main sold-listings table by listing URL.

    :return: merged DataFrame (one row per listing in df_saled)
    """
    detail = pd.DataFrame(data={
        '房源链接': url_list,
        '小区链接': community_url,
        '调价次数': times_of_adprice,
        '关注人数': popular_attention,
        '浏览次数': times_of_scan,
        '房屋户型': room_type,
        '户型结构': room_construct,
        '建筑类型': buding_type,
        '建筑结构': buding_construct,
        '梯户比例': pct_houseoflift,
    })
    base_cols = ['房源名称', '房源链接', '朝向', '成交日期', '房源楼层',
                 '成交周期', '挂牌价', '成交价', '房屋年龄', '房屋单价']
    return pd.merge(df_saled[base_cols], detail, on='房源链接', how='left')
def get_ip(path='IP池.txt'):
    """Pick one random HTTP proxy from a newline-separated IP-pool file.

    :param path: pool file with one "host:port" per line
        (default 'IP池.txt', matching the original hard-coded name)
    :return: dict suitable for requests' ``proxies=`` argument,
        e.g. {'http': '1.2.3.4:8080'}
    """
    # Context manager closes the file even on error; the old code leaked
    # the handle if read() raised.
    with open(path, 'r', encoding='utf-8-sig') as f:
        # Skip blank lines so a useless {'http': ''} entry can never win.
        ip_list = [{'http': ip} for ip in f.read().split('\n') if ip.strip()]
    return np.random.choice(ip_list)
## Worker thread for the scrape pool.
class myThread(threading.Thread):
    """Worker that repeatedly calls get_data() on the shared queue until
    the queue runs dry (queue.Empty from q.get breaks the loop)."""

    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name  # thread label used in progress logs
        self.q = q        # shared work queue of listing URLs

    def run(self):
        # Missing space in the original log messages ('StartingThread-1').
        print('Starting ' + self.name)
        while True:
            try:
                get_data(self.name, self.q)
            except Exception:
                # queue.Empty (or any other escape from get_data) means
                # there is no more work; narrowed from a bare `except`
                # that also swallowed KeyboardInterrupt/SystemExit.
                break
        print('Exiting ' + self.name)
if __name__ == '__main__':
    mu = threading.Lock()  # guards the shared result lists (see list_extend)
    today = datetime.now().strftime('%Y%m%d')
    file = './链家成交二手房主页信息' + today + '.csv'  # strftime already returns str
    df_saled = pd.read_csv(file, encoding='utf-8')

    # Shared result lists, one value appended per listing.
    url_list = []           # listing URL — row key, keeps columns aligned
    community_url = []      # community page URL (contains community id)
    times_of_adprice = []   # number of price adjustments
    popular_attention = []  # number of followers
    times_of_scan = []      # number of page views
    room_type = []          # floor plan
    room_construct = []     # floor-plan structure
    buding_type = []        # building type
    buding_construct = []   # building structure
    pct_houseoflift = []    # elevator-to-household ratio

    # Size the queue from the already-loaded DataFrame instead of
    # re-reading the CSV via get_url().
    urls = list(df_saled['房源链接'])
    workQueue = Queue.Queue(len(urls))

    # Fill the queue BEFORE starting workers: q.get(timeout=2) raises
    # queue.Empty on an empty queue, so a worker started first could
    # otherwise give up and exit before any URL was enqueued.
    for url in urls:
        workQueue.put(url)

    thread_list = ['Thread-%d' % i for i in range(1, 11)]
    threads = []
    for t_name in thread_list:
        thread = myThread(t_name, workQueue)
        thread.start()
        threads.append(thread)

    # Wait for every worker to drain the queue.
    for t in threads:
        t.join()

    df_all = df_concat()
    print(df_all)
    df_all.to_csv(file, encoding='utf-8-sig', index=False)
# 小区详情页数据 — community/listing detail-page data scraper.
# (Stray blog-page footer retained as a comment: "最新推荐文章于 2024-10-01 20:02:09 发布")