# -*- coding:utf-8 -*-
# Original note (translated): "Only this place needs to be changed:
# https://jn.lianjia.com/ershoufang/pg{}rs/ -- replace `jn` with the lowercase
# pinyin initials of your city."
# NOTE(review): that note appears to be left over from a Lianjia scraper; the
# only URL this script requests is the Meituan sales-board endpoint stored in
# ``LianjiaSpider.url`` below.
import csv
import json
import random
import time

import requests
from lxml import etree  # not referenced by the code in this file; kept as-is


class LianjiaSpider(object):
    """Fetch Meituan sales-board listings per city and append them to a CSV.

    The class name is historical; the endpoint queried is
    ``mobilenext-web.meituan.com/api/newSalesBoard/getSaleBoardDetail``.
    """

    # File that parsed rows are appended to (relative to the working directory).
    CSV_PATH = 'meituan.csv'

    # Keys copied from every ``saleBoardPoiList`` entry, in CSV column order.
    CSV_FIELDS = (
        "name",
        "weekSaleCount",
        "score",
        "avgPrice",
        "cateName",
        "areaName",
        "distance",
        "rank",
        "frontImg",
        "oneSentence",
    )

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # URL template; the two ``{}`` slots are filled with cityId and cateId.
        self.url = "https://mobilenext-web.meituan.com/api/newSalesBoard/getSaleBoardDetail?cityId={}&boardType=8&districtId=0&cateId={}&offset=0&limit=50"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}

    def get_page(self, url, i, j):
        """Download ``url`` and pass the response text to :meth:`parse_page`.

        Args:
            url: Fully formatted request URL.
            i: City id the URL was built from (forwarded unchanged).
            j: Category id the URL was built from (forwarded unchanged).

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-2xx HTTP status.
            ValueError: If the response body is not the expected JSON object.
        """
        print(url)
        res = requests.get(url=url, headers=self.headers,
                           timeout=self.REQUEST_TIMEOUT)
        # Fail here on HTTP errors instead of trying to parse an error page.
        res.raise_for_status()
        res.encoding = "utf-8"
        self.parse_page(res.text, i, j)

    @staticmethod
    def _extract_records(html):
        """Return the list of entries stored under ``saleBoardPoiList``.

        The whole document is decoded as JSON rather than sliced at fixed
        character offsets, so the result does not depend on how many digits
        ``totalSize`` has or on what follows the list in the response.

        Raises:
            ValueError: If ``html`` is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError``) or does not decode to a JSON object.
        """
        payload = json.loads(html)
        if not isinstance(payload, dict):
            raise ValueError("unexpected response payload: %r" % (html[:200],))
        # A missing or null list is treated the same as an empty one.
        entries = payload.get("saleBoardPoiList") or []
        return [entry for entry in entries if isinstance(entry, dict)]

    def parse_page(self, html, i, j):
        """Append one CSV row per listing found in the response text ``html``.

        Args:
            html: Raw JSON text returned by the sales-board endpoint.
            i: City id of the request (not used here; kept for callers).
            j: Category id of the request (not used here; kept for callers).

        Raises:
            ValueError: If ``html`` cannot be decoded as the expected JSON.
        """
        records = self._extract_records(html)
        print(len(records))
        if not records:
            # Nothing to write; do not create or touch the CSV file.
            return

        # Open the output once per page instead of once per row.
        with open(self.CSV_PATH, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for record in records:
                print(record.get("id"), record.get("name"))
                # Missing keys become empty cells; JSON null is written by
                # the csv module as an empty cell as well.
                writer.writerow(
                    [record.get(field, "") for field in self.CSV_FIELDS])

    def main(self):
        """Request every configured city/category pair, pausing between calls."""
        # Original note (translated): 20 = Guangzhou, 118 = Hong Kong,
        # 165 = Baishan, 170 = Hegang.
        # NOTE(review): the list contains 178 while the note mentions 170 --
        # confirm which id is intended.
        city_ids = [1, 10, 20, 118, 165, 178]
        for city_id in city_ids:
            # range(1, 2) yields only cateId 1.
            for cate_id in range(1, 2):
                # Random 3-5 second pause before each request.
                time.sleep(random.randint(3, 5))
                url = self.url.format(city_id, cate_id)
                try:
                    self.get_page(url, city_id, cate_id)
                except (requests.RequestException, ValueError) as exc:
                    # Report the failure and continue with the next pair so
                    # one bad response does not abort the whole run.
                    print("skipped cityId=%s cateId=%s: %s"
                          % (city_id, cate_id, exc))


if __name__ == '__main__':
    start = time.time()
    spider = LianjiaSpider()
    spider.main()
    end = time.time()
    print("执行时间:%.2f" % (end - start))
mt包括空值和totalsize为个位数的数值,缺少全部城市
最新推荐文章于 2024-09-06 22:25:52 发布