import base64
import datetime
import json
import os
import re
import time

import pymysql
import requests
from fake_useragent import UserAgent
from fontTools.ttLib import TTFont
from lxml import etree
ua = UserAgent()


class CustomException(Exception):
    """Exception carrying a status code plus a human-readable message."""

    def __init__(self, status, msg):
        # BUG FIX: initialise Exception too, so str(e) / e.args behave normally.
        super().__init__(msg)
        self.status = status
        self.msg = msg


class City_58:
    """Crawler for 58.com (58同城); for now only these two sections are handled.

    Rental listings url:   https://cd.58.com/chuzu/       ("cd" is the code for Chengdu)
    Second-hand homes url: https://cd.58.com/ershoufang/
    """

    # Anti-scraping web-font glyph name -> the real digit it renders
    # (used together with fontTools.TTFont to decode obfuscated numbers).
    font_dict = {
        "glyph00001": "0",
        "glyph00002": "1",
        "glyph00003": "2",
        "glyph00004": "3",
        "glyph00005": "4",
        "glyph00006": "5",
        "glyph00007": "6",
        "glyph00008": "7",
        "glyph00009": "8",
        "glyph00010": "9",
    }
    # Shared DB connection placeholder; presumably opened elsewhere via
    # pymysql (imported at module level) — confirm against the rest of the file.
    conn = None

    def __init__(self):
        # One Session so cookies persist across requests; a random
        # user-agent per instance to reduce the chance of being blocked.
        self.session = requests.Session()
        self.session.headers = {"user-agent": ua.random}
        self.__init__all_city()

    def __init__all_city(self):
        """Fetch the city-name -> abbreviation mapping for every city 58.com serves.

        Populates ``self.all_city_dict``, e.g. ``{"成都": "cd", ...}``.
        """
        api = "https://www.58.com/changecity.html"
        headers = self.session.headers.copy()
        response = self.session.get(api, headers=headers)
        html = response.text
        # BUG FIX: the original pattern "cityList = (.*?)" ended with a lazy
        # group, so the capture was always the empty string and json.loads()
        # raised.  Terminate the match on the following "var", mirroring the
        # working independentCityList pattern below.
        # NOTE(review): confirm against the live page markup.
        res = re.findall(r"cityList = (.*?)var", html, re.S)[0]
        # Raw string for the regex: "\s" is an invalid escape in a plain string.
        res = re.sub(r"\s", "", res)
        dic = json.loads(res)
        # Values look like "cd|成都"; keep only the abbreviation before "|".
        for k, v in dic.items():
            for k1, v1 in v.items():
                dic[k][k1] = v1.split("|")[0]
        city_dict = {}

        def traverse_dict(dic: dict):
            # Flatten the nested province -> city structure into city_dict,
            # skipping the "海外" (overseas) and "其他" (other) pseudo-groups.
            for k, v in dic.items():
                if k == "海外" or k == "其他":
                    continue
                if isinstance(v, dict):
                    traverse_dict(v)
                city_dict[k] = v

        traverse_dict(dic)
        # Municipality-level cities are listed separately as independentCityList.
        other_city = re.findall(r"independentCityList = (.*?)var", html, re.S)[0]
        res = re.sub(r"\s", "", other_city)
        other_city_dic = json.loads(res)
        for k, v in other_city_dic.items():
            other_city_dic[k] = v.split("|")[0]
        city_dict.update(other_city_dic)
        self.all_city_dict = city_dict

    def spider_zufang(self, city: str = "成都", is_get_all: bool = True):
        """Crawler method for rental-housing listings."""
assert self.all_city_dict is not None, "获取所有城市信息失败 !"format_city=self.all_city_dict.pop(city, None)assert format_city is not None, "{}该城市不在爬取城市之内".format(city)whileTrue:
self.city=city#self.file = open("./house_info.json", "a", encoding="utf-8")
start_url = self.__init_zufang(format_city)#思路是什么,首先进入区域的租房页面,在该页面中先提取出相应的title,比如经纪人,个人房源等等...
#我们需要构建出相应的url就可以了
#start_url的格式为 https://cd.58.com/chuzu/ 我们需要转为这样的格式 https://cd.58.com/jintang/hezu/
#我们访问转化后的地址,再拿去到相应的链接,比如经纪人,个人房源等链接
#拿到该链接以后,这就是这个分类里的第一页url,我们再对这个链接发生请求,
#拿到响应体,这里可以写一个while循环,因为我们不知道有多少页,其实也可以知道有多少页,就是在这个响应体中可拿到
#我的思路就是写一个while循环,判断是否有下一页,有的继续,没有的话直接break
for url_info_list in self.__get_url(start_url):#这里的话,最好进行判断一下,因为每个title(值个人房源,品牌公寓等..)不一样的话,可能爬取的策略也不太一样
title = url_info_list[1]if title in ["个人房源", "安选房源", "经纪人", "热租房源"] or "出租" intitle: