python爬虫requests源码链家_Python爬虫案例，python爬取链家二手房信息源码

最新推荐文章于 2023-01-28 15:54:09 发布

weixin_39628864

最新推荐文章于 2023-01-28 15:54:09 发布

阅读量160

点赞数

文章标签： python爬虫requests源码链家

#爬取链家二手房信息

import json
import re

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

class HomeLinkSpider(object):
    """Scrape second-hand housing listings from a paginated search-results URL.

    The spider reads the total page count from the first page, then walks
    every page and extracts one dict per listing into ``self.data``.

    Args:
        url: Base search-results URL; page URLs are built by appending
            ``pg<N>/`` to it.
        timeout: Seconds passed as ``timeout`` to every ``requests.get`` call.
    """

    # Placeholder stored when a field cannot be extracted from the page.
    UNKNOWN = "未知"

    # Compiled once at class level instead of once per listing.
    _FLOOR_PATTERN = re.compile(r'.+\)')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _PRICE_PATTERN = re.compile(r'\d+')

    def __init__(self, url, timeout=10):
        self.ua = UserAgent()
        # A random User-Agent string is chosen once and reused for all requests.
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.url = url
        self.timeout = timeout

    def get_max_page(self):
        """Return the total number of result pages, or ``None`` if unavailable.

        ``None`` is returned when the response status is not 200, when the
        pagination element is absent, or when its ``page-data`` attribute
        cannot be parsed.
        """
        response = requests.get(
            self.url, headers=self.headers, timeout=self.timeout
        )
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        boxes = soup.select('div[class="page-box house-lst-page-box"]')
        if not boxes:
            return None

        # The attribute comes from a remote page, so it is parsed as JSON
        # rather than executed with eval().
        try:
            return int(json.loads(boxes[0].attrs['page-data'])["totalPage"])
        except (KeyError, TypeError, ValueError):
            return None

    def parse_page(self):
        """Fetch every result page and collect the listings.

        Returns:
            ``self.data``: the list of listing dicts gathered so far. The list
            is left unchanged when the page count cannot be determined.
        """
        max_page = self.get_max_page()
        print(max_page)
        if max_page is None:
            return self.data

        for i in range(1, max_page + 1):
            # Both placeholders are required; otherwise the page number is
            # dropped and the same URL is requested on every iteration.
            url = "{}pg{}/".format(self.url, i)
            response = requests.get(
                url, headers=self.headers, timeout=self.timeout
            )
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find_all("ul", class_="sellListContent")
            if not ul:
                # No listing container on this page; skip it.
                continue

            for li in ul[0].select("li"):
                detail = self._parse_listing(li)
                if detail is not None:
                    self.data.append(detail)

        print(self.data)
        return self.data

    @classmethod
    def _parse_listing(cls, li):
        """Build the detail dict for one ``<li>``; ``None`` if it has no title."""

        def text_of(selector):
            nodes = li.select(selector)
            return nodes[0].get_text() if nodes else ""

        titles = li.select('div[class="title"]')
        if not titles:
            # Items without a title block are not treated as listings.
            return None

        detail = dict()
        # NOTE: the misspelled key "titile" is kept so existing consumers of
        # the returned dicts keep working.
        detail["titile"] = titles[0].get_text()
        detail.update(cls._parse_house_info(text_of('div[class="houseInfo"]')))
        detail.update(
            cls._parse_position(text_of('div[class="positionInfo"]'))
        )
        # e.g. "650万" -> "650"
        detail['total_price'] = cls._first_match(
            cls._PRICE_PATTERN, text_of('div[class="totalPrice"]')
        )
        # e.g. "单价64182元/平米" -> "64182"
        detail['unit_price'] = cls._first_match(
            cls._PRICE_PATTERN, text_of('div[class="unitPrice"]')
        )
        return detail

    @classmethod
    def _parse_house_info(cls, house_info):
        """Split a ``" | "``-separated house-info string into four named fields.

        Example input:
        ``"大华锦绣华城(九街区) | 3室2厅 | 76.9平米 | 南 | 其他 | 无电梯"``.
        Missing trailing fields are filled with ``UNKNOWN``.
        """
        keys = ('house', 'bedroom', 'area', 'direction')
        parts = house_info.split(" | ") if house_info else []
        parts = parts + [cls.UNKNOWN] * (len(keys) - len(parts))
        return dict(zip(keys, parts))

    @classmethod
    def _parse_position(cls, position_info):
        """Extract floor, build year and location from a position string.

        Example input: ``"低楼层(共7层)2006年建板楼 - 张江"``. Any part that
        cannot be found is set to ``UNKNOWN``.
        """
        parts = position_info.split(" - ")
        head = parts[0]
        return {
            'floor': cls._first_match(cls._FLOOR_PATTERN, head),
            'year': cls._first_match(cls._YEAR_PATTERN, head),
            'location': parts[1] if len(parts) > 1 else cls.UNKNOWN,
        }

    @classmethod
    def _first_match(cls, pattern, text):
        """Return the first match of *pattern* anywhere in *text*, else ``UNKNOWN``."""
        match = pattern.search(text)
        return match.group() if match else cls.UNKNOWN

# Guarded so that importing this module does not start a network crawl.
if __name__ == "__main__":
    # Search-results URL used as the crawl starting point.
    url = "https://sh.lianjia.com/ershoufang/minhang/l2p1/"

    spider = HomeLinkSpider(url)

    # parse_page() returns the collected list; report how many listings it holds.
    print(len(spider.parse_page()))

weixin_39628864

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。