import base64
import datetime
import json
import os
import re
import time

import pymysql
import requests
from fake_useragent import UserAgent
from fontTools.ttLib import TTFont
from lxml import etree
ua = UserAgent()


class CustomException(Exception):
    """Exception carrying a status code plus a human-readable message."""

    def __init__(self, status, msg):
        # BUG FIX: initialise Exception too, so str(e) / e.args behave normally.
        super().__init__(msg)
        self.status = status
        self.msg = msg


class City_58:
    """Crawler for 58.com (58同城); for now only these two sections are handled.

    Rental listings url:   https://cd.58.com/chuzu/       ("cd" is the code for Chengdu)
    Second-hand homes url: https://cd.58.com/ershoufang/
    """

    # Anti-scraping web-font glyph name -> the real digit it renders
    # (used together with fontTools.TTFont to decode obfuscated numbers).
    font_dict = {
        "glyph00001": "0",
        "glyph00002": "1",
        "glyph00003": "2",
        "glyph00004": "3",
        "glyph00005": "4",
        "glyph00006": "5",
        "glyph00007": "6",
        "glyph00008": "7",
        "glyph00009": "8",
        "glyph00010": "9",
    }
    # Shared DB connection placeholder; presumably opened elsewhere via
    # pymysql (imported at module level) — confirm against the rest of the file.
    conn = None

    def __init__(self):
        # One Session so cookies persist across requests; a random
        # user-agent per instance to reduce the chance of being blocked.
        self.session = requests.Session()
        self.session.headers = {"user-agent": ua.random}
        self.__init__all_city()

    def __init__all_city(self):
        """Fetch the city-name -> abbreviation mapping for every city 58.com serves.

        Populates ``self.all_city_dict``, e.g. ``{"成都": "cd", ...}``.
        """
        api = "https://www.58.com/changecity.html"
        headers = self.session.headers.copy()
        response = self.session.get(api, headers=headers)
        html = response.text
        # BUG FIX: the original pattern "cityList = (.*?)" ended with a lazy
        # group, so the capture was always the empty string and json.loads()
        # raised.  Terminate the match on the following "var", mirroring the
        # working independentCityList pattern below.
        # NOTE(review): confirm against the live page markup.
        res = re.findall(r"cityList = (.*?)var", html, re.S)[0]
        # Raw string for the regex: "\s" is an invalid escape in a plain string.
        res = re.sub(r"\s", "", res)
        dic = json.loads(res)
        # Values look like "cd|成都"; keep only the abbreviation before "|".
        for k, v in dic.items():
            for k1, v1 in v.items():
                dic[k][k1] = v1.split("|")[0]
        city_dict = {}

        def traverse_dict(dic: dict):
            # Flatten the nested province -> city structure into city_dict,
            # skipping the "海外" (overseas) and "其他" (other) pseudo-groups.
            for k, v in dic.items():
                if k == "海外" or k == "其他":
                    continue
                if isinstance(v, dict):
                    traverse_dict(v)
                city_dict[k] = v

        traverse_dict(dic)
        # Municipality-level cities are listed separately as independentCityList.
        other_city = re.findall(r"independentCityList = (.*?)var", html, re.S)[0]
        res = re.sub(r"\s", "", other_city)
        other_city_dic = json.loads(res)
        for k, v in other_city_dic.items():
            other_city_dic[k] = v.split("|")[0]
        city_dict.update(other_city_dic)
        self.all_city_dict = city_dict

    def spider_zufang(self, city: str = "成都", is_get_all: bool = True):
        """Crawler method for rental-housing listings."""
assert self.all_city_dict is not None, "获取所有城市信息失败 !"format_city=self.all_city_dict.pop(city, None)assert format_city is not None, "{}该城市不在爬取城市之内".format(city)whileTrue:
self.city=city#self.file = open("./house_info.json", "a", encoding="utf-8")
start_url = self.__init_zufang(format_city)#思路是什么,首先进入区域的租房页面,在该页面中先提取出相应的title,比如经纪人,个人房源等等...
#我们需要构建出相应的url就可以了
#start_url的格式为 https://cd.58.com/chuzu/ 我们需要转为这样的格式 https://cd.58.com/jintang/hezu/
#我们访问转化后的地址,再拿去到相应的链接,比如经纪人,个人房源等链接
#拿到该链接以后,这就是这个分类里的第一页url,我们再对这个链接发生请求,
#拿到响应体,这里可以写一个while循环,因为我们不知道有多少页,其实也可以知道有多少页,就是在这个响应体中可拿到
#我的思路就是写一个while循环,判断是否有下一页,有的继续,没有的话直接break
for url_info_list in self.__get_url(start_url):#这里的话,最好进行判断一下,因为每个title(值个人房源,品牌公寓等..)不一样的话,可能爬取的策略也不太一样
title = url_info_list[1]if title in ["个人房源", "安选房源", "经纪人", "热租房源"] or "出租" intitle: