import requests
from bs4 import BeautifulSoup
import urllib.request
from pypinyin import lazy_pinyin
import os
import csv
import stat
import re
# ---------------------------------------------------------------------------
# Module-level state shared by the functions below.
# ---------------------------------------------------------------------------

# City chosen by the user: `chengshi` is the name as typed, `city` is the
# abbreviation that get_init() derives from it for the sub-domain.
chengshi = ''
city = ''

# Names of the cities offered by the site; filled in by get_city_list().
city_list = []

# Header row written at the top of the CSV export.
headrow = [
    '序号',
    '地址',
    '户型',
    '面积(平米)',
    '单价(元/平)',
    '总价(万)',
    '关注度(人)',
    '发布时间',
    '链接',
]

# Running number of the next listing; used both as the first CSV column and
# as the file name of the downloaded image.
pic_code = 1

# Pieces of the result-page URL: base_url + url_tail_a + 'pg<page>' +
# url_tail_b + '/'.  `url` holds the most recently built page URL.
base_url = ''
url_tail_a = ''
url_tail_b = ''
url = ''
def get_init(chengshi):
    """Return the abbreviation used as the city's sub-domain.

    Args:
        chengshi: City name in Chinese characters.

    Returns:
        The first character of every pinyin syllable of *chengshi*, joined
        into one string.  '哈尔滨' is handled explicitly and maps to 'hrb'.
    """
    # This city's abbreviation is hard-coded instead of derived from pinyin.
    if chengshi == '哈尔滨':
        return 'hrb'
    # lazy_pinyin returns a list with one pinyin string per syllable.
    # Slicing with [:1] keeps the first character and tolerates an empty
    # entry, where indexing with [0] would raise IndexError.
    return ''.join(syllable[:1] for syllable in lazy_pinyin(chengshi))
def get_request(page):
    """Build the customised request for one result page.

    The page URL is assembled from the module-level ``base_url``,
    ``url_tail_a`` and ``url_tail_b`` and is also stored in the module-level
    ``url``; it is printed before the request object is created.

    Args:
        page: Page number, inserted into the URL as ``pg<page>``.

    Returns:
        A ``urllib.request.Request`` carrying the Cookie and user-agent
        headers.
    """
    global url
    headers = {
        # The cookie must be a single line; it is written as two adjacent
        # literals that Python concatenates without adding any separator.
        'Cookie': (
            'lianjia_uuid=b954b713-8673-41bb-ae40-c7b789c191f2; _ga=GA1.2.782127310.1725084192; login_ucid=2000000038925211; lianjia_token=2.0012444e2c6a8cd05b03e9671d31b0a9a5; lianjia_token_secure=2.0012444e2c6a8cd05b03e9671d31b0a9a5; security_ticket=JvihxL9HT6A2dfOzM7hDxZgOvIZBZZWKN8wlA5bzNUkWUa0UEiJVCoKwoFIWeQeOTW6XGozZ/DA6gCd4bnhnKDk3gNsH3YCFuAvdY77ESdC85x2kh55AD4UkB+6jLLD0WrjxCLKyIrlghRXayR81fMQCCESR14oDLjAiQX0VFys=; ftkrc_=9fa10feb-3ea4-4693-959e-97e02cd86126; lfrc_=63aae4fd-09ce-471a-898e-3c3a19add881; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22191a707757c18dd-03fe4a47fb0174-4c657b58-1821369-191a707757d88e%22%2C%22%24device_id%22%3A%22191a707757c18dd-03fe4a47fb0174-4c657b58-1821369-191a707757d88e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga_KJTRWRHDL1=GS1.2.1725240066.3.1.1725242204.0.0.0; _ga_QJN1VP0CMS=GS1.2.1725240066.3.1.1725242204.0.0.0; Hm_lvt_46bf127ac9b856df503ec2dbf942b67e=1725084169,1725238994,1725369493; HMACCOUNT=3FE6DFDF13B14547; _gid=GA1.2.88217649.1725369505; _ga_W9S66SNGYB=GS1.2.1725369506.8.1.1725370466.0.0.0; _ga_1W6P4PWXJV=GS1.2.1725369506.8.1.1725370466.0.0.0; _ga_N51MBR7HR4=GS1.2.1725426803.7.0.1725426803.0.0.0; _ga_WLZSQZX7DE=GS1.2.1725427375.3.1.1725427386.0.0.0; _ga_TJZVFLS7KV=GS1.2.1725427375.3.1.1725427386.0.0.0; lianjia_ssid=46b06c27-5a28-4b1a-b534-5277d136edad; hip=MFhgIa8WkHh8h4j5-YQb6L0wDZRQHH6hfmAiQ6UJYnr65G34-p_HwKDN9SM3Kz6ORWF5FObMUBaajYzO_xn4KskmZn8nRuThPO4J_cG5oXNMwO0zLBX52_fVOYlKmdxgRz07zCe8cPxHCt33FFPqW3c3CbFRcErDrA0lI7L_V05yrTbQEP0N-zDrzw%3D%3D; select_city=320100; Hm_lpvt_46bf127ac9b856df503ec2dbf942b67e=1725452178; Qs_lvt_200116=1725452178; Qs_pv_200116=335356058542352100; '
            'srcid=eyJ0Ijoie1wiZGF0YVwiOlwiZmIxNDk1NGQ4NDllNzdmNWI4ZGIyMTBkYTMyMjUyMWFkM2M1NDYzY2Q5NGQ4MTQ2ZWY0NGNhMGVjMTY4ZjJlZjI2MGY1ZmM3MDJmYTJjOGQwMDE1ZTJmOTQyMDk2ZWI1OTcxYzdlOGFjMTM1ZDQ1NzBjMWU3YTQyMTM0OTc1ZGNmM2EyNWVlNTAwZGQ2OTNjZjhlNDkxYTY5MzNhMjczZjYxODg5NWU3NzJhYTEyZGExYmU4NTg4M2JlNDE5ZTA3ZTcwNzg0MTI1ZDA3OTg1ODExMjI4ZmQ3MmU5NWRjZTM3NmRlYjI1YWMxYjNiMTBkYzA1Mjg4M2U4YmUxZGE2YlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCI3MjBmZDZiZlwifSIsInIiOiJodHRwczovL25qLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcveXVodWF0YWkvcGcxcDMvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _ga_MFYNHLJT0H=GS1.2.1725452189.1.0.1725452189.0.0.0; _ga_E91JCCJY3Z=GS1.2.1725452189.1.0.1725452189.0.0.0'
        ),
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
    }
    # URL layout: <base_url><url_tail_a>pg<page><url_tail_b>/
    url = base_url + url_tail_a + 'pg' + str(page) + url_tail_b + '/'
    print(url)
    return urllib.request.Request(url=url, headers=headers)
def get_html(request):
    """Download a page and return it as a parsed document.

    Args:
        request: The request to open with ``urllib.request.urlopen``.

    Returns:
        A ``BeautifulSoup`` tree built from the page text with the
        ``html.parser`` backend.

    Side effects:
        Overwrites ``wb.html`` in the current directory with the page text.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')  # page text as str
    # Keep a copy of the most recently downloaded page on disk.
    with open('wb.html', 'w', encoding='utf-8') as f:
        f.write(content)
    return BeautifulSoup(content, 'html.parser')
def get_info(soup):
    """Extract one CSV row per listing from a result page and save its image.

    Args:
        soup: Parsed result page.

    Returns:
        A list of rows.  Each row is a list in the column order of
        ``headrow``: sequence number, address, layout, area, unit price,
        total price, followers, publish time, link.

    Side effects:
        Tries to save each listing image as ``<pic_code>.jpg`` inside the
        module-level ``docu_imag_path`` and increments the module-level
        ``pic_code`` once per listing.  A failed image download is reported
        on stdout and does not stop the extraction.
    """
    global pic_code
    house_infos = soup.find_all('div', attrs={'class': 'houseInfo'})
    total_prices = soup.find_all('div', attrs={'class': "totalPrice totalPrice2"})
    unit_prices = soup.find_all('div', attrs={'class': "unitPrice"})
    positions = soup.find_all('div', attrs={'class': "positionInfo"})
    follows = soup.find_all('div', attrs={'class': "followInfo"})
    images = soup.find_all('img', attrs={'class': "lj-lazy"})
    links = soup.find_all('a', attrs={'class': "noresultRecommend img LOGCLICKDATA"})

    data_lis = []
    # NOTE(review): the seven lists are paired purely by position, so a
    # listing that lacks one of these elements would shift every later row.
    # Confirm against the page markup.
    for posi_tag, info_tag, price_tag, unit_tag, follow_tag, img_tag, link_tag in zip(
            positions, house_infos, total_prices, unit_prices, follows, images, links):
        row = _build_row(
            pic_code,
            posi_tag.get_text(),
            info_tag.get_text(),
            unit_tag.get_text(),
            price_tag.get_text(),
            follow_tag.get_text(),
            link_tag.get("href"),
        )
        # The image file is named after the row's sequence number.
        imag_path = os.path.join(docu_imag_path, "{}.jpg".format(pic_code))
        _download_image(img_tag.get("data-original"), imag_path)
        pic_code += 1
        data_lis.append(row)
    return data_lis


def _build_row(number, posi_text, info_text, unit_text, price_text, follow_text, link):
    """Turn the raw texts of one listing into a CSV row."""
    # Address: keep only the first two '-' separated parts.
    position = '-'.join(posi_text.split('-')[:2])
    # The second '|' separated field is the area; the rest is the layout.
    fields = info_text.split('|')
    area = fields.pop(1) if len(fields) > 1 else ''
    layout = '|'.join(fields)
    # Follow info is '<followers>/<publish time>'.
    follow_parts = follow_text.split('/')
    followers = follow_parts[0].replace('人关注', '')
    publish_time = follow_parts[1] if len(follow_parts) > 1 else ''
    return [
        str(number),
        position,
        layout,
        area.replace('平米', ''),
        unit_text.replace('元/平', ''),
        price_text.replace('万', ''),
        followers,
        publish_time,
        link,
    ]


def _download_image(src, path, timeout=30):
    """Save the image at *src* to *path*; report and skip it on failure."""
    if not src:
        print('图片下载失败: 没有图片地址 ({})'.format(path))
        return False
    try:
        # A timeout keeps one stalled download from hanging the whole crawl.
        response = requests.get(src, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        print('图片下载失败: {} ({})'.format(src, err))
        return False
    with open(path, 'wb') as f:
        f.write(response.content)
    return True
def print_info(list):
    """Write the scraped rows to the CSV file.

    Args:
        list: Rows to write; each row becomes one line of the table.

    The rows go through the module-level ``writer``.
    """
    # writerows() writes the rows one after another, in order.
    writer.writerows(list)
def delete_former():
    """Delete everything in the current directory except a fixed keep-list.

    WARNING: every entry of the current working directory whose name is not
    in the keep-list below is removed, directories included.  Only call this
    from the scraper's own working directory.
    """
    import shutil  # local import: only this function needs it

    current_path = os.getcwd()
    important_files = {
        'spider.py', '__pycache__', 'wb.html',
        'function.py', 'test.py', 'data_analysis.py',
    }
    for name in os.listdir(current_path):
        if name in important_files:
            continue
        del_path = os.path.join(current_path, name)
        if os.path.islink(del_path):
            # Remove the link itself; never follow it.
            os.remove(del_path)
            continue
        is_dir = os.path.isdir(del_path)
        # Add the needed permission bits to the existing mode instead of
        # replacing the mode with write-only: a directory also needs read and
        # search permission before its content can be removed.
        extra_bits = stat.S_IRWXU if is_dir else stat.S_IWUSR
        os.chmod(del_path, os.stat(del_path).st_mode | extra_bits)
        if is_dir:
            # os.remove() cannot delete a directory.
            shutil.rmtree(del_path)
        else:
            os.remove(del_path)
def input_demand():
    """Interactively collect the search parameters and prepare the outputs.

    Asks for the city, an optional district, optional filter options and the
    page range.  Along the way it sets the module-level ``chengshi``,
    ``city``, ``base_url`` and ``url_tail_a``, lets ``deal_singal_demand``
    handle every accepted filter option, opens the CSV file and writes its
    header through the module-level ``writer``, and creates the image folder
    stored in the module-level ``docu_imag_path``.

    Returns:
        Tuple ``(start, end)``: the first and the last page to crawl.
    """
    global url_tail_a, base_url, city, chengshi, docu_imag_path, writer

    # Ask for the city until the name is one of the known cities.
    chengshi = input("请输入城市:")
    while chengshi not in city_list:
        print("输入错误!")
        chengshi = input("请重新输入城市:")
    print("正在加载中----------")
    city = get_init(chengshi)

    # Download the city's listing page to learn its districts and filters.
    base_url = 'https://{}.lianjia.com/ershoufang/'.format(city)
    headers = {
        # The cookie must be a single line; it is written as two adjacent
        # literals that Python concatenates without adding any separator.
        'Cookie': (
            'lianjia_uuid=b954b713-8673-41bb-ae40-c7b789c191f2; _ga=GA1.2.782127310.1725084192; login_ucid=2000000038925211; lianjia_token=2.0012444e2c6a8cd05b03e9671d31b0a9a5; lianjia_token_secure=2.0012444e2c6a8cd05b03e9671d31b0a9a5; security_ticket=JvihxL9HT6A2dfOzM7hDxZgOvIZBZZWKN8wlA5bzNUkWUa0UEiJVCoKwoFIWeQeOTW6XGozZ/DA6gCd4bnhnKDk3gNsH3YCFuAvdY77ESdC85x2kh55AD4UkB+6jLLD0WrjxCLKyIrlghRXayR81fMQCCESR14oDLjAiQX0VFys=; ftkrc_=9fa10feb-3ea4-4693-959e-97e02cd86126; lfrc_=63aae4fd-09ce-471a-898e-3c3a19add881; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22191a707757c18dd-03fe4a47fb0174-4c657b58-1821369-191a707757d88e%22%2C%22%24device_id%22%3A%22191a707757c18dd-03fe4a47fb0174-4c657b58-1821369-191a707757d88e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga_KJTRWRHDL1=GS1.2.1725240066.3.1.1725242204.0.0.0; _ga_QJN1VP0CMS=GS1.2.1725240066.3.1.1725242204.0.0.0; Hm_lvt_46bf127ac9b856df503ec2dbf942b67e=1725084169,1725238994,1725369493; HMACCOUNT=3FE6DFDF13B14547; _gid=GA1.2.88217649.1725369505; _ga_W9S66SNGYB=GS1.2.1725369506.8.1.1725370466.0.0.0; _ga_1W6P4PWXJV=GS1.2.1725369506.8.1.1725370466.0.0.0; _ga_N51MBR7HR4=GS1.2.1725426803.7.0.1725426803.0.0.0; _ga_WLZSQZX7DE=GS1.2.1725427375.3.1.1725427386.0.0.0; _ga_TJZVFLS7KV=GS1.2.1725427375.3.1.1725427386.0.0.0; lianjia_ssid=46b06c27-5a28-4b1a-b534-5277d136edad; hip=MFhgIa8WkHh8h4j5-YQb6L0wDZRQHH6hfmAiQ6UJYnr65G34-p_HwKDN9SM3Kz6ORWF5FObMUBaajYzO_xn4KskmZn8nRuThPO4J_cG5oXNMwO0zLBX52_fVOYlKmdxgRz07zCe8cPxHCt33FFPqW3c3CbFRcErDrA0lI7L_V05yrTbQEP0N-zDrzw%3D%3D; select_city=320100; Hm_lpvt_46bf127ac9b856df503ec2dbf942b67e=1725452178; Qs_lvt_200116=1725452178; Qs_pv_200116=335356058542352100; '
            'srcid=eyJ0Ijoie1wiZGF0YVwiOlwiZmIxNDk1NGQ4NDllNzdmNWI4ZGIyMTBkYTMyMjUyMWFkM2M1NDYzY2Q5NGQ4MTQ2ZWY0NGNhMGVjMTY4ZjJlZjI2MGY1ZmM3MDJmYTJjOGQwMDE1ZTJmOTQyMDk2ZWI1OTcxYzdlOGFjMTM1ZDQ1NzBjMWU3YTQyMTM0OTc1ZGNmM2EyNWVlNTAwZGQ2OTNjZjhlNDkxYTY5MzNhMjczZjYxODg5NWU3NzJhYTEyZGExYmU4NTg4M2JlNDE5ZTA3ZTcwNzg0MTI1ZDA3OTg1ODExMjI4ZmQ3MmU5NWRjZTM3NmRlYjI1YWMxYjNiMTBkYzA1Mjg4M2U4YmUxZGE2YlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCI3MjBmZDZiZlwifSIsInIiOiJodHRwczovL25qLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcveXVodWF0YWkvcGcxcDMvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _ga_MFYNHLJT0H=GS1.2.1725452189.1.0.1725452189.0.0.0; _ga_E91JCCJY3Z=GS1.2.1725452189.1.0.1725452189.0.0.0'
        ),
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
    }
    request = urllib.request.Request(base_url, headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')

    # District selection ('1' means: no district).
    section_links = soup.select('div[data-role="ershoufang"]>div>a')
    all_section = [link.get_text() for link in section_links]
    print('-'.join(all_section))
    section = input('[若不需要选择区域输入1]\n请选择你要查询的区域:')
    if section != '1':
        while section not in all_section:
            print('输入错误!')
            section = input('请重新输入你要查询的区域:')
        # Read the href from the very anchor whose text the user picked,
        # rather than searching the whole page for a matching string.
        section_href = section_links[all_section.index(section)].get('href')
        url_tail_a = section_href.split('/')[2] + '/'

    # Filter selection ('1' means: no filter).
    all_demand = [tag.get_text() for tag in soup.select('.name')]
    print('-'.join(all_demand))
    demands = input('[若不需要输入1]\n请选择你的需求:')
    if demands != '1':
        # Accept ASCII commas, full-width commas and whitespace as separators.
        requested = demands.replace(',', ' ').replace(',', ' ').split()
        # Collect the accepted options in a new list; removing items from a
        # list while iterating over it would skip the following entry.
        demands = []
        for demand in requested:
            if demand not in all_demand:
                print('输入错误!')
                print('没有 [{}]'.format(demand))
                continue
            deal_singal_demand(demand, soup)
            demands.append(demand)

    start = _read_page_number("请输入开始爬取的页数:")
    end = _read_page_number("请输入爬取结束的页数:")

    # Prepare the CSV file.
    # Author's note: utf-8 avoids errors when a lot of data is crawled but
    # the output looks garbled; gb18030 is not garbled but raises errors on
    # large amounts of data.
    old_filename = '{}-{}-{}data.csv'.format(chengshi, section, '-'.join(demands))
    # The file is left open on purpose: rows are written through `writer`
    # after this function has returned.
    fcsv = open(old_filename, 'w', newline='', encoding='gb18030')
    writer = csv.writer(fcsv)
    writer.writerow(headrow)  # header row

    # Create the image folder for this search.
    # NOTE(review): unlike the CSV name there is no '-' between the district
    # and the first filter option here. Left unchanged to keep folder names
    # stable; confirm whether that is intended.
    docu_imag_name = chengshi + '-' + section + '-'.join(demands)
    docu_imag_path = os.path.join(os.getcwd(), docu_imag_name)
    os.makedirs(docu_imag_path, exist_ok=True)
    return start, end


def _read_page_number(prompt):
    """Prompt until the user enters an integer and return it."""
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            print('输入错误!')
def deal_singal_demand(demand, soup):
    """Append the URL code of one filter option to the module-level ``url_tail_b``.

    Args:
        demand: Label of the filter option as shown on the page.
        soup: Parsed page that contains the filter links.

    Raises:
        ValueError: If the page has no text node equal to *demand*.
    """
    global url_tail_b
    # Match the complete label, ignoring surrounding whitespace.  Escaping
    # keeps regex metacharacters in the label literal, and anchoring stops a
    # label from matching inside a longer one.
    pattern = re.compile(r'^\s*{}\s*$'.format(re.escape(demand)))
    label = soup.find(string=pattern)
    if label is None:
        raise ValueError('没有 [{}]'.format(demand))
    # The href is read from the element two levels above the text node.
    href = label.parent.parent.get('href')
    # The third '/' separated part of the href is the option's URL code.
    url_tail_b += href.split('/')[2]
def operation(start, end):
    """Crawl every page from *start* to *end* and export its rows.

    Args:
        start: First page number to crawl.
        end: Last page number to crawl (included).
    """
    # range() excludes its upper bound, hence end + 1.
    pages = range(start, end + 1)
    for page in pages:
        # build request -> download and parse -> extract rows -> write rows
        print_info(get_info(get_html(get_request(page))))
def get_city_list():
    """Download the city index page and store the city names in ``city_list``.

    Side effects:
        Prints a loading message and replaces the module-level ``city_list``
        with the text of every city link found on the page.
    """
    global city_list
    print("正在加载中----------")
    url = 'https://www.lianjia.com/city/'  # local name; the module-level `url` is untouched
    headers = {
        # The cookie must be a single line; it is written as two adjacent
        # literals that Python concatenates without adding any separator.
        'Cookie': (
            'lianjia_uuid=b954b713-8673-41bb-ae40-c7b789c191f2; _ga=GA1.2.782127310.1725084192; login_ucid=2000000038925211; lianjia_token=2.0012444e2c6a8cd05b03e9671d31b0a9a5; lianjia_token_secure=2.0012444e2c6a8cd05b03e9671d31b0a9a5; security_ticket=JvihxL9HT6A2dfOzM7hDxZgOvIZBZZWKN8wlA5bzNUkWUa0UEiJVCoKwoFIWeQeOTW6XGozZ/DA6gCd4bnhnKDk3gNsH3YCFuAvdY77ESdC85x2kh55AD4UkB+6jLLD0WrjxCLKyIrlghRXayR81fMQCCESR14oDLjAiQX0VFys=; ftkrc_=9fa10feb-3ea4-4693-959e-97e02cd86126; lfrc_=63aae4fd-09ce-471a-898e-3c3a19add881; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22191a707757c18dd-03fe4a47fb0174-4c657b58-1821369-191a707757d88e%22%2C%22%24device_id%22%3A%22191a707757c18dd-03fe4a47fb0174-4c657b58-1821369-191a707757d88e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga_KJTRWRHDL1=GS1.2.1725240066.3.1.1725242204.0.0.0; _ga_QJN1VP0CMS=GS1.2.1725240066.3.1.1725242204.0.0.0; Hm_lvt_46bf127ac9b856df503ec2dbf942b67e=1725084169,1725238994,1725369493; HMACCOUNT=3FE6DFDF13B14547; _gid=GA1.2.88217649.1725369505; _ga_W9S66SNGYB=GS1.2.1725369506.8.1.1725370466.0.0.0; _ga_1W6P4PWXJV=GS1.2.1725369506.8.1.1725370466.0.0.0; _ga_N51MBR7HR4=GS1.2.1725426803.7.0.1725426803.0.0.0; _ga_WLZSQZX7DE=GS1.2.1725427375.3.1.1725427386.0.0.0; _ga_TJZVFLS7KV=GS1.2.1725427375.3.1.1725427386.0.0.0; lianjia_ssid=46b06c27-5a28-4b1a-b534-5277d136edad; hip=MFhgIa8WkHh8h4j5-YQb6L0wDZRQHH6hfmAiQ6UJYnr65G34-p_HwKDN9SM3Kz6ORWF5FObMUBaajYzO_xn4KskmZn8nRuThPO4J_cG5oXNMwO0zLBX52_fVOYlKmdxgRz07zCe8cPxHCt33FFPqW3c3CbFRcErDrA0lI7L_V05yrTbQEP0N-zDrzw%3D%3D; select_city=320100; Hm_lpvt_46bf127ac9b856df503ec2dbf942b67e=1725452178; Qs_lvt_200116=1725452178; Qs_pv_200116=335356058542352100; '
            'srcid=eyJ0Ijoie1wiZGF0YVwiOlwiZmIxNDk1NGQ4NDllNzdmNWI4ZGIyMTBkYTMyMjUyMWFkM2M1NDYzY2Q5NGQ4MTQ2ZWY0NGNhMGVjMTY4ZjJlZjI2MGY1ZmM3MDJmYTJjOGQwMDE1ZTJmOTQyMDk2ZWI1OTcxYzdlOGFjMTM1ZDQ1NzBjMWU3YTQyMTM0OTc1ZGNmM2EyNWVlNTAwZGQ2OTNjZjhlNDkxYTY5MzNhMjczZjYxODg5NWU3NzJhYTEyZGExYmU4NTg4M2JlNDE5ZTA3ZTcwNzg0MTI1ZDA3OTg1ODExMjI4ZmQ3MmU5NWRjZTM3NmRlYjI1YWMxYjNiMTBkYzA1Mjg4M2U4YmUxZGE2YlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCI3MjBmZDZiZlwifSIsInIiOiJodHRwczovL25qLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcveXVodWF0YWkvcGcxcDMvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _ga_MFYNHLJT0H=GS1.2.1725452189.1.0.1725452189.0.0.0; _ga_E91JCCJY3Z=GS1.2.1725452189.1.0.1725452189.0.0.0'
        ),
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
    }
    request = urllib.request.Request(url, headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')
    # Every city is a link inside the 'city_list' container.
    city_links = soup.select('div[class="city_list"]>div>ul>li>a')
    city_list = [link.get_text() for link in city_links]
from function import input_demand
from function import operation
from function import get_city_list
from function import delete_former
# Step 1: load the list of cities the user may choose from.
get_city_list()

# Cleanup of the previous run's output is currently switched off.
# delete_former()

# Step 2: ask the user what to search for; this yields the page range.
start, end = input_demand()

# Step 3: crawl the requested pages.
operation(start, end)
可实现但尚未实现的功能:
1. 筛选需求分行展示
2. 判断结果页是否为空,并及时提示用户
3. 每次程序启动时删除上次运行产生的数据(目前删除时总是提示没有权限)