完成了在携程网站爬取指定城市的特色美食的功能。
具体方式是输入用户所选择的城市,然后就可以返回一系列的特色美食的信息。
首先是爬取携程的所有城市的ID,并把它保存在一个列表中,用户指定城市后就可以直接从中取出,然后访问到指定城市的页面。
获取城市ID:
def getCityList(self):
    """Fetch Ctrip's domestic-city index and populate ``self.cityList``.

    Downloads https://piao.ctrip.com/ticket/?districtid=1, pulls the JSON
    payload assigned to ``window.__INITIAL_STATE__`` out of the page's
    fifth ``<script>`` tag, and maps every city name to its numeric
    Ctrip ID in ``self.cityList``.

    NOTE(review): the script index (4) and the slicing offsets are tied
    to Ctrip's current page layout — verify if the site markup changes.
    """
    html = self.getHtml('https://piao.ctrip.com/ticket/?districtid=1')
    soup = BS(html, "html.parser")
    scripts = soup.find_all('script')
    if len(scripts) <= 4:
        # Page layout changed; nothing we can parse.
        return
    # Strip whitespace so the marker-based slicing below is stable.
    text = scripts[4].text.replace(' ', '').replace('\n', '')
    # +25 skips past "window.__INITIAL_STATE__=" (24 chars + '=');
    # the payload ends where the next global assignment begins.
    start = text.find('window.__INITIAL_STATE__') + 25
    end = text.find('window.__APP_SETTINGS__=')
    cities = json.loads(text[start:end])
    # Provinces each carry a nested list of their cities.
    for province in cities['citiesData']['domesticcity']['cities']:
        for city in province['cities']:
            self.cityList[city['name']] = city['id']
def getCityID(self, city):
    """Return the Ctrip ID for *city*, using fuzzy name matching.

    Lazily builds the name->ID table via ``getCityList`` on first use.

    Raises:
        KeyError: when no known city name scores at least 80 against
            *city* (the original code raised an opaque IndexError here).
    """
    if not self.cityList:
        self.getCityList()
    # score_cutoff=80 rejects poor matches; limit=1 keeps only the best.
    result = process.extractBests(city, self.cityList.keys(),
                                  score_cutoff=80, limit=1)
    if not result:
        raise KeyError('no city matching %r' % city)
    return self.cityList[result[0][0]]
然后再对城市页面进行爬取,获得各个美食的详细介绍的页面,再将其介绍和图片的链接保存起来即可:
def getXC(self, city):
    """Scrape listing pages 1-2 of a city's Ctrip sights and print them.

    For each entry on the listing pages, follows the detail link,
    collects the introduction paragraphs and image-link tags, and
    builds one dict per sight (id, name, introduce, img, city).
    The collected dicts are printed at the end.
    """
    results = []
    count = 1
    # The original scraper only walks listing pages 1 and 2.
    for page in range(1, 3):
        page_url = self.base + self.getCityUrl(city) + "/s0-p" + str(page) + ".html"
        soup = BS(self.getHtml(page_url), 'html.parser')
        boxes = soup.find_all(name="div", attrs={"class": "rdetailbox"})
        print("len(vs)", len(boxes))
        for box in boxes:
            try:
                # Anchor holding both the detail-page link and the name.
                link = box.find(name="a", attrs={"target": "_blank"})
                href = link.attrs["href"]
                # Fetch the detail page for the introduction text.
                detail = BS(self.getHtml(self.base2 + href), "html.parser")
                introduce = [li.get_text() for li in
                             detail.find_all(name="li", attrs={"class": "infotext"})]
                # NOTE(review): these are <a> Tag objects, not URL strings —
                # presumably the image URLs are extracted downstream; verify.
                imglinks = detail.find_all(
                    name="a", attrs={"href": "javascript:void(0)"})
                name = link.string.replace(" ", "").replace("\n", "")
                results.append({
                    "id": count,
                    "name": name,
                    "introduce": introduce,
                    "img": imglinks,
                    "city": city,
                })
                count += 1
            except Exception as e:
                # Best-effort scrape: log the bad entry and keep going.
                print(e)
    for item in results:
        print(item)
后续将继续爬取多个平台上的门票信息,以进行对比。