import requests,re
from openpyxl.workbook import Workbook
from openpyxl.writer.excel import ExcelWriter
def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Defaults to 10.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # Send a desktop-Safari User-Agent instead of the requests default.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
    }
    # requests has no default timeout; without one a stalled connection
    # would block the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
# Compiled once at import time. Each raw-string piece below is one stage of
# the match; concatenated they form the listing pattern. re.S lets ".*?"
# span newlines, since a single listing's markup may cover several lines.
_LISTING_PATTERN = re.compile(
    r'price"><span>(.*?)</span>(.*?)</div>'
    r'.*?data-housecode="(.*?)"'
    r'.*?data-el=.*?>(.*?)</a>'
    r'<div class=.*?>(.*?)<span>/</span>(.*?)<span>/</span>(.*?)'
    r'<span>/</span>(.*?)<span>/</span>(.*?)</div>'
    r'.*?<span class=.*?>(.*?)<'
    r'.*?<span class=.*?>(.*?)<'
    r'.*?<span class=.*?>(.*?)<.*?',
    re.S,
)


def parse_page(html):
    """Extract every listing record found in *html*.

    Args:
        html: Markup of one listing page, as a string.

    Returns:
        A list of 12-tuples of strings, one tuple per match, in document
        order. The tuple fields follow the capture groups of the pattern:
        the text inside the ``<span>`` after ``price">``, the text between
        that span and ``</div>``, the ``data-housecode`` attribute value,
        the link text of the ``data-el`` anchor, five fields separated by
        ``<span>/</span>``, and the text of the next three ``<span class=``
        elements. Returns an empty list when nothing matches.
    """
    # NOTE(review): the business meaning of each field (price, unit, layout,
    # area, ...) depends on the site's markup - confirm against a live page.
    return _LISTING_PATTERN.findall(html)
def parse_all(first_page=1, last_page=100):
    """Download a range of listing pages and collect all parsed records.

    Args:
        first_page: Number of the first page to fetch. Defaults to 1.
        last_page: Number of the last page to fetch, inclusive.
            Defaults to 100.

    Returns:
        A single list with the records returned by ``parse_page`` for
        every page, in page order.
    """
    # "{0}" is replaced with the page number; the query string is kept
    # exactly as in the original request URL.
    url_template = "https://sh.lianjia.com/ershoufang/pg{0}/?utm_source=baidu&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaotimiaoshu&utm_campaign=sousuo&ljref=pc_sem_baidu_ppzq_x"
    property_data = []
    for page_number in range(first_page, last_page + 1):
        html = get_page(url_template.format(page_number))
        property_data.extend(parse_page(html))
    return property_data
def save_property_list
爬虫爬取链家房源数据并保存为本地 Excel 或 TXT 文件
最新推荐文章于 2023-08-10 19:47:03 发布
本文介绍如何使用 requests 库发送 HTTP 请求抓取链家网站上的房源列表页，用正则表达式解析 HTML 内容并提取房源信息，最终将数据保存为 Excel 或 TXT 文件，实现房源数据的自动化收集与分析。
摘要由CSDN通过智能技术生成