Scraping Web Page Data with Python and XPath

First published 2022-07-20 17:45:32

This post walks through scraping second-hand housing listings for Chongqing from Lianjia (链家网, cq.lianjia.com) with Python's requests and lxml libraries. The script first fetches the list of districts, then walks each district to collect its sub-area pages, scrapes every listing under each sub-area (complex name, location, room details, follower count, total price, and so on), and writes the rows to a CSV file. The steps cover page parsing, data extraction, and file output.

```python
import csv

import requests
from lxml import etree
# from fake_useragent import UserAgent  # optional: randomized User-Agent headers

# Base URLs: the Chongqing second-hand housing listing page and the site root
url = 'https://cq.lianjia.com/ershoufang'
uurl = 'https://cq.lianjia.com'
row_count = 0  # running count of rows written

# Optional request-header spoofing:
# header = {"User-Agent": UserAgent().random}

# Fetch the landing page and parse it with lxml
res = requests.get(url)
html = etree.HTML(res.text)

# Extract the list of districts
Area = html.xpath('/html/body/div[3]/div/div[1]/dl[2]/dd/div/div[1]/a')
AreaList = {}
for are in Area:
    AreaList[are.text] = uurl + are.xpath('./@href')[0]  # district name -> district URL
print(AreaList)

seen_locations = []  # sub-areas already scraped (some appear under several districts)

with open('./链家网数据集.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["名字", "区域", "标题", "地址", "几室几厅", "房屋面积", "朝向", "装修程度",
                     "楼层", "建造时间", "结构类型", "关注人数", "发表时间", "单价", "总价"])
    for areaName, areaUrl in AreaList.items():
        # Collect the sub-area links within this district
        rep = requests.get(areaUrl)
        html = etree.HTML(rep.text)
        loc = html.xpath('/html/body/div[3]/div/div[1]/dl[2]/dd/div/div[2]/a')
        locList = {}
        for i in loc:
            # Drop any query string so 'pgN' can be appended to the URL later
            locList[i.text] = uurl + str(i.xpath('./@href')[0]).split('?')[0]

        # Scrape the listings under each sub-area
        for locName, locUrl in locList.items():
            if locName in seen_locations:
                continue
            seen_locations.append(locName)
            print(locName, locUrl)
            try:
                resp = requests.get(locUrl)
                html = etree.HTML(resp.text)
                # Total number of listings; Lianjia shows 30 listings per page
                MaxNum = html.xpath('//*[@id="content"]/div[1]/div[2]/h2/span/text()')[0]
                page_count = (int(MaxNum) + 29) // 30  # ceil(MaxNum / 30)
                print("Total pages:", page_count)
                for i in range(1, page_count + 1):
                    if i > 20:  # stop after 20 pages per sub-area
                        break
                    print("Fetching page", i)
                    resp = requests.get(locUrl + 'pg' + str(i))
                    html = etree.HTML(resp.text)
                    # One <li> per listing
                    allList = html.xpath('//*[@id="content"]/div[1]/ul/li[@data-lj_evtid="21624"]')
                    for li in allList:
                        row = []
                        name = li.xpath('./div[1]/div[1]/a/text()')          # listing title
                        address = li.xpath('./div[1]/div[2]/div/a/text()')   # complex + area
                        address = address[0] + address[1]
                        house_info = li.xpath('./div[1]/div[3]/div/text()')  # layout/size/orientation/...
                        time_info = li.xpath('./div[1]/div[4]/text()')       # followers / listing date
                        tatoll = li.xpath('./div[1]/div[6]/div[1]/span/text()')  # total price (number)
                        wan = li.xpath('./div[1]/div[6]/div[1]/i[2]/text()')     # unit: 万 (10,000 CNY)
                        tatoll = tatoll[0] + wan[0]
                        val = li.xpath('./div[1]/div[6]/div[2]/span/text()')     # unit price
                        row.extend([areaName, locName, name[0], address])
                        # Split the detail string into up to seven fields, padding with NULL
                        k = house_info[0].split("|")
                        for j in range(7):
                            if j < len(k) and k[j].strip() != "暂无数据":  # "暂无数据" = "no data"
                                row.append(k[j].strip())
                            else:
                                row.append("NULL")
                        # Split "followers / listing date"
                        a = time_info[0].split("/")
                        row.extend([a[0], a[1], tatoll, val[0]])
                        row_count += 1
                        print(row_count, row)
                        writer.writerow(row)
            except Exception as e:
                print(e)
```
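To make the detail-field handling concrete, here is a small standalone check of the `split("|")` logic. The sample strings below are made up for illustration and are not taken from a live Lianjia page:

```python
# Hypothetical detail strings in the format Lianjia uses: fields separated by "|"
samples = [
    "3室2厅 | 89.2平米 | 南 | 精装 | 中楼层(共33层) | 2015年建 | 板楼",
    "2室1厅 | 65平米 | 暂无数据 | 简装",  # short record: missing trailing fields
]

for house_info in samples:
    fields = house_info.split("|")
    row = []
    # Pad to exactly seven columns so every CSV row lines up
    for j in range(7):
        if j < len(fields) and fields[j].strip() != "暂无数据":
            row.append(fields[j].strip())
        else:
            row.append("NULL")
    print(row)
```

The second sample shows why the padding matters: a listing with missing fields still produces a full seven-column slice, so the CSV header stays aligned.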
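The commented-out User-Agent line suggests Lianjia may reject the default requests User-Agent. A minimal hardened fetch helper might look like the sketch below; the header value, timeout, and retry count are assumptions, not part of the original script:

```python
import time

import requests

# Hypothetical browser-like User-Agent; swap in fake_useragent's UserAgent().random if installed
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

def fetch(url, retries=3, delay=2.0):
    """GET a page with a spoofed User-Agent, a timeout, and simple retries."""
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            print(f"attempt {attempt} failed for {url}: {e}")
            time.sleep(delay)
    return None  # caller should skip this URL

# Usage in the scraper: resp = fetch(locUrl + 'pg' + str(i))
```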
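Once the CSV is written, a quick way to sanity-check the dataset is to load it with pandas; the checks below are illustrative, using the column names the scraper writes:

```python
import pandas as pd

# Read the dataset back; utf-8 matches the writer's encoding
df = pd.read_csv('./链家网数据集.csv', encoding='utf-8')

print(df.shape)                   # (rows, 15 columns)
print(df.head())                  # first few listings
print(df["区域"].value_counts())  # listings per sub-area
```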