榛果美团登录爬虫 requests session

最新推荐文章于 2022-06-29 22:34:04 发布
伏地僧
最新推荐文章于 2022-06-29 22:34:04 发布
阅读量949
点赞数
本文链接：https://blog.csdn.net/zhiwei_bian/article/details/102473593
版权
所有美团旗下产品的登录都是通过重定向来完成的，
也就是利用 session 会话来保持登录状态；
当然，也可以每次请求都模拟它的 cookie 来登录。
我用的代理是阿布云代理，你们也可以选择别的代理。
这次爬取的是美团旗下的榛果民宿。
  1 import requests
  2 from urllib.parse import urlencode
  3 import json
  4 import time, datetime
  5 import logging
  6 from lxml import etree
  7 import pymysql
  8 from pymysql.err import IntegrityError
  9 
 10 proxies_ = {
 11     'http': '@http-dyn.abuyun.com:9020',
 12     'https': '@http-dyn.abuyun.com:9020',
 13 }
 14 headers = {
 15     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52'
 16 }
 17 session = requests.Session()
 18 
 19 
 20 def session_get(url, header=headers, tab=12):
 21     if tab == 0:
 22         return False
 23     try:
 24         response = session.get(url, headers=header, proxies=proxies_)
 25         time.sleep(2)
 26         return response if response.status_code == 200 else session_get(url, header, tab - 1)
 27     except Exception as e:
 28         if tab == 1:
 29             logging.exception(e)
 30         return session_get(url, header, tab - 1)
 31 
 32 
 33 def session_post(url, header=headers, data=None, tab=12):
 34     if tab == 0:
 35         return False
 36     try:
 37         response = session.post(url, headers=header, data=data, proxies=proxies_)
 38         time.sleep(2)
 39         return response if response.status_code == 200 else session_post(url, header, data, tab - 1)
 40     except Exception as e:
 41         if tab == 1:
 42             logging.exception(e)
 43         return session_post(url, header, data, tab - 1)
 44 
 45 
 46 def get_node_text(node, xpath):
 47     """
 48     通过节点和xpath来获取节点需要的内容
 49     :param node:
 50     :param xpath:
 51     :return:
 52     """
 53     try:
 54         if xpath == "string(.)": return node.xpath('string(.)').strip()
 55         if len(node.xpath(xpath)) > 0:
 56             return node.xpath(xpath)[0].strip() if isinstance(node.xpath(xpath)[0], str) else node.xpath(xpath)[0]
 57         return ""
 58     except:
 59         logging.exception('获取xpath %s 出错' % (xpath))
 60         return None
 61 
 62 
 63 def get_youjia_tpp_conn():
 64     """
 65     获取井队数据库连接
 66     :return:
 67     """
 68     return pymysql.connect(host='host', user='user', passwd='passwd', db='db', port=3306,
 69                            charset='utf8')
 70 
 71 
 72 def storage_database_text(data_json, t_name, l_name="youjia_tpp"):
 73     """
 74     非json类型数据存储数据库
 75     :param data_json:
 76     :param t_name:
 77     :param l_name:
 78     :return:
 79     """
 80     now_time = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
 81     data_list = []
 82     insert_sql = "INSERT INTO " + l_name + "." + t_name + " ("
 83     update_sql = "UPDATE " + l_name + "." + t_name + " SET "
 84     for key in data_json:
 85         update_sql += str(key) + "=%s , "
 86         if str(key) == "id":
 87             id_key = data_json[key]
 88         insert_sql += str(key) + ","
 89     update_sql += "modify_time = '" + str(now_time) + "' where id = '" + str(id_key) + "'"
 90     insert_sql = insert_sql[:-1]
 91     insert_sql += ")VALUES("
 92     for key in data_json:
 93         insert_sql += "%s,"
 94         data_list.append(str(data_json[key]))
 95     insert_sql = insert_sql[:-1]
 96     insert_sql += ");"
 97     # print(update_sql)
 98     # print(insert_sql)
 99     with get_youjia_tpp_conn() as conn:
100         try:
101             print("storage_database_text  insert_sql : ", t_name)
102             conn.execute(insert_sql, tuple(data_list))
103         except IntegrityError:
104             print("storage_database_text  update_sql : ", t_name)
105             conn.execute(update_sql, tuple(data_list))
106         except Exception as msg:
107             logging.exception(msg)
108 
109 
def storage_database_json(id_, data_json, j_name, t_name, l_name="youjia_tpp"):
    """
    Store a JSON document in one column: INSERT a new row, or UPDATE the
    existing row when the insert hits an integrity error.

    Table, schema and column names cannot be bound as parameters, so they
    must come from trusted code, never from scraped content.

    :param id_: value for the ``id`` column
    :param data_json: serialized JSON text to store
    :param j_name: name of the column holding the JSON
    :param t_name: table name
    :param l_name: schema (database) name
    :return: None
    """
    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    table = l_name + "." + t_name
    insert_sql = "INSERT INTO " + table + " (`id`,`" + j_name + "`)VALUES(%s,%s);"
    update_sql = "update " + table + " set `" + j_name + "`=%s , modify_time=%s where id = %s;"

    # Use an explicit cursor and commit instead of relying on what
    # ``with connection`` yields, which differs between PyMySQL releases.
    conn = get_youjia_tpp_conn()
    try:
        with conn.cursor() as cursor:
            try:
                print("storage_database_json  insert_sql : ", t_name)
                cursor.execute(insert_sql, (id_, data_json))
            except IntegrityError:
                print("storage_database_json  update_sql : ", t_name)
                cursor.execute(update_sql, (data_json, now_time, id_))
        conn.commit()
    except Exception as msg:
        # Storage is best-effort: log the failure and keep crawling.
        logging.exception(msg)
    finally:
        conn.close()
133 
134 
def pre_login():
    """
    Fetch the unified login page that carries the hidden form fields.

    :return: HTML text of the login page, or ``None`` when the page could
        not be fetched
    """
    param = {
        # 'uuid': 'e8514dbe200b4fde9393.1532912269.1.0.0',
        'service': 'phoenix',
        'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
    }
    url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(param)
    response = session_get(url=url, header=headers, tab=5)
    # session_get yields a falsy value when every attempt failed, so test
    # the result before touching any response attribute.
    if not response:
        print('预登陆出错')
        return None
    print("pre_login 成功")
    return response.text
152 
153 
def parse_param(html):
    """
    Extract the hidden login-form values from the pre-login page.

    :param html: HTML text of the login page
    :return: tuple ``(csrf, uuid, need_captcha, origin, fingerprint)``, or
        ``None`` when the page could not be parsed or a field is missing
    """
    try:
        tree = etree.HTML(html)

        def hidden_input(name):
            # First value of the <input> with this name; IndexError if absent.
            return tree.xpath('//input[@name="%s"]/@value' % name)[0]

        csrf = hidden_input("csrf")
        origin = hidden_input("origin")
        fingerprint = hidden_input("fingerprint")
        uuid = tree.xpath('//i[@class="form-uuid"]/text()')[0]
        captcha_style = tree.xpath(
            '//div[@class="form-field J-form-field-captcha form-field--captcha"]/@style')[0]
        # What remains of the style attribute once "display:" is removed.
        need_captcha = captcha_style.replace("display:", "")
        return (csrf, uuid, need_captcha, origin, fingerprint)
    except Exception:
        # Keep the traceback so a changed page layout can be diagnosed.
        logging.exception('解析csrf,uuid,need_captcha出错')
        return None
166 
167 
def formal_login(username, password, param,
                 captcha_path='C:/Users/admin/Desktop/image/zg.jpg'):
    """
    Submit the login form, asking the operator to type the captcha.

    The captcha image is always downloaded and written to ``captcha_path``;
    the need-captcha value in ``param`` is not consulted.

    :param username: account name, sent as the ``email`` form field
    :param password: account password
    :param param: sequence whose items 0, 1, 3 and 4 are csrf, uuid, origin
        and fingerprint
    :param captcha_path: file the captcha image is written to
    :return: HTML text of the login response, or ``None`` on failure
    """
    if not param:
        print('登录出错')
        return None
    csrf, uuid = param[0], param[1]
    origin, fingerprint = param[3], param[4]

    captcha_url = 'https://passport.meituan.com/account/captcha?' + urlencode({'uuid': uuid})
    print(captcha_url)
    image_resp = session_get(captcha_url)
    # session_get yields a falsy value when every attempt failed.
    if not image_resp:
        print('登录出错')
        return None
    with open(captcha_path, 'wb') as file:
        file.write(image_resp.content)
    captcha = input('需要验证码:')

    url_param = {
        'uuid': uuid,
        'service': 'phoenix',
        'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
    }
    postdata = {
        'email': username,
        'password': password,
        'captcha': captcha,
        'origin': origin,
        'fingerprint': fingerprint,
        'csrf': csrf
    }
    url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(url_param)
    response = session_post(url, data=postdata, header=headers)
    if not response:
        print('登录出错')
        return None
    print("登陆成功！")
    return response.text
208 
209 
def parse_token(html):
    """
    Extract the redirect form (action URL and hidden fields) and the csrf
    value from the login response page.

    :param html: HTML text returned by the login POST
    :return: dict with keys ``action_url``, ``token``, ``expire``,
        ``isdialog``, ``autologin`` and ``csrf``, or ``None`` when the page
        could not be parsed or a field is missing
    """
    try:
        tree = etree.HTML(html)
        result = {
            "action_url": tree.xpath('//form[@class="J-form mainbox__content"]/@action')[0],
        }
        for field in ("token", "expire", "isdialog", "autologin"):
            result[field] = tree.xpath('//input[@name="%s"]/@value' % field)[0]
        result["csrf"] = tree.xpath('//*[@id="csrf"]/text()')[0]
        return result
    except Exception:
        logging.exception('解析token出错')
        return None
227 
228 
def redirect_login(token_json):
    """
    Post the login token to the redirect form's action URL.

    Side effect: stores the csrf value in the shared ``headers`` dict under
    ``x-csrf-token``, so later requests using that dict send it too.

    :param token_json: dict with keys ``action_url``, ``token``, ``expire``,
        ``isdialog``, ``autologin`` and ``csrf``
    :return: None
    """
    if not token_json:
        print('重定向出错')
        return
    postdata = {
        'token': token_json['token'],
        'expire': token_json['expire'],
        'isdialog': token_json['isdialog'],
        'autologin': token_json['autologin'],
        'logintype': 'normal'
    }
    headers['x-csrf-token'] = token_json['csrf']
    trust_response = session_post(token_json['action_url'], data=postdata, header=headers)
    # Report success only when the POST really came back; session_post
    # yields a falsy value when every attempt failed.
    if trust_response:
        print("重定向成功！！")
    else:
        print('重定向出错')
251 
252 
def test():
    """
    Manual smoke check: wait five seconds, request a profile page through
    the shared session and print the status code and body.

    :return: None
    """
    time.sleep(5)
    url = 'http://maoyan.com/profile'
    response = session_get(url, header=headers)
    # session_get yields a falsy value when every attempt failed.
    if not response:
        print('测试出错')
        return
    print(response.status_code)
    print(response.text)
263 
264 
def crawl_order(account_id, token, page_no=1, page_size=20):
    """
    Crawl the host's orders page by page and store each order.

    Pages are requested until one comes back with fewer than ``page_size``
    entries. Each order is stored as JSON and linked to ``account_id``.

    Side effect: sets ``x-csrf-token``, ``Accept`` and ``Content-Type`` on
    the shared ``headers`` dict.
    NOTE(review): those JSON headers then apply to every later request that
    uses the shared dict -- confirm that is intended.

    :param account_id: account the orders belong to
    :param token: login token; not used by this function
    :param page_no: first page to fetch (1-based)
    :param page_size: number of orders requested per page
    :return: None
    """
    orders_url = "https://www.zhenguo.com/host/orders/"
    query_url = "https://www.zhenguo.com/gw/order/api/v1/orderSearch/queryOrderByType"

    # Iterating advances the page number itself, which keeps the csrf
    # refresh per page and cannot shift arguments between parameters.
    while True:
        response = session_get(orders_url, header=headers)
        if not response:
            logging.error("orders page could not be fetched (page %s)", page_no)
            return
        print(response.status_code)
        csrf_values = etree.HTML(response.text).xpath('//meta[@name="csrf-token"]/@content')
        if not csrf_values:
            logging.error("csrf-token meta tag missing on orders page (page %s)", page_no)
            return
        csrf = csrf_values[0]
        headers['x-csrf-token'] = csrf
        print(csrf)

        headers['Accept'] = "application/json"
        headers['Content-Type'] = "application/json"
        order_query = {'pageNow': page_no, 'pageSize': page_size, 'orderStatusType': 9}
        query_response = session_post(query_url, data=json.dumps(order_query), header=headers)
        if not query_response:
            logging.error("order query failed (page %s)", page_no)
            return

        # Tolerate a payload without data/list instead of raising KeyError.
        query_list = (query_response.json().get('data') or {}).get('list') or []
        print(len(query_list))
        for order_json in query_list:
            order_id = order_json['orderId']
            storage_database_json(order_id, json.dumps(order_json), 'order', 'zhenguo_order')
            storage_database_text({"id": order_id, 'account_id': account_id}, 'zhenguo_order')

        # A short page means there is nothing left to fetch.
        if len(query_list) != page_size:
            return
        page_no += 1
288 
289 
def house_detail(list_json):
    """
    Fetch a listing's detail page and add the parsed fields to ``list_json``.

    Keys that may be added: ``room_type``, ``house_wear``, ``room_area``,
    ``room_count``, ``comment_count``, ``rep_rate``, ``rep_length``,
    ``less_day``, ``more_day`` and ``unsubscribe``. When the page cannot be
    fetched the dict is returned unchanged.

    :param list_json: dict describing the listing; its ``"id"`` entry is
        used to build the detail URL. The dict is modified in place.
    :return: the same ``list_json`` object
    """
    room_id = list_json["id"]
    room_url = "https://www.zhenguo.com/housing/%s" % room_id
    room_response = session_get(room_url)
    if not room_response:
        return list_json

    html = etree.HTML(room_response.text)
    # Every field lives under the same container, so build paths from it.
    detail_root = '//*[@id="J-layout"]/div[2]/div/div[2]/div/div[2]'
    summary = detail_root + '/section[1]/div[2]'
    list_json["room_type"] = get_node_text(html, summary + '/span[1]/text()')
    list_json["house_wear"] = get_node_text(html, summary + '/span[2]/text()')
    list_json["room_area"] = get_node_text(html, summary + '/span[3]/text()')

    # Label shown on the page -> key written into list_json.
    label_to_key = {
        "房源": "room_count",
        "评价": "comment_count",
        "咨询回复率": "rep_rate",
        "咨询回复时长": "rep_length",
    }
    for node in html.xpath(detail_root + '/section[2]/ul/li'):
        text = get_node_text(node, './div[1]/text()')
        node_detail = get_node_text(node, './div[2]/text()')
        if isinstance(text, str) and text in label_to_key:
            list_json[label_to_key[text]] = node_detail

    # The booking rule is split on a full-width comma; it may be missing, or
    # None when extraction failed.
    reserve_text = get_node_text(html, detail_root + '/section[8]/ul[1]/li[2]/text()')
    reserve = reserve_text.split("，") if isinstance(reserve_text, str) else []
    if len(reserve) > 1:
        list_json["less_day"] = reserve[0].replace("最少预订", "").replace("天", "").strip()
        list_json["more_day"] = reserve[1].replace("最多预订", "").replace("天", "").strip()

    list_json["unsubscribe"] = get_node_text(html, detail_root + '/section[8]/ul[2]/li/text()')
    return list_json
337 
338 
def crawl_room(account_id, token):
    """
    Crawl the host's listing page and store each listing with its details
    and its comments.

    :param account_id: account the listings belong to
    :param token: login token; not used by this function
    :return: None
    """
    comment_url = "https://www.zhenguo.com/gw/ugc/api/v1/product/comments?productId=%s&pageNow=1&pageSize=100"
    room_list_url = "https://www.zhenguo.com/house/list/"
    room_response = session_get(url=room_list_url, header=headers)
    # Without the list page there is nothing to iterate over.
    if not room_response:
        logging.error("listing page could not be fetched: %s", room_list_url)
        return

    html = etree.HTML(room_response.text)
    for node in html.xpath('//div[@class="houseCard__block"]'):
        title = get_node_text(node, './div[@class="houseCard__titleLine"]/text()')  # title
        # get_node_text may give None on an extraction error; treat as empty.
        price = (get_node_text(node, './div[@class="houseCard__addLine clearfix"]'
                                     '/span[1]/span[@class="houseCard__price"]/text()')
                 or "").replace("¥", "")  # price
        state = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]/'
                                    'div[1]/span[@class="houseCard__verifyStatus-5"]/text()')  # status
        room_id = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]'
                                      '/div[1]/@data-product-id')  # listing id
        print(account_id, title, price, state, room_id)
        list_json = {"account_id": account_id, "title": title,
                     "price": price, "state": state, "id": room_id, "room_id": room_id}

        house_json = house_detail(list_json)
        response = session_get(url=comment_url % room_id)
        if response:
            print(response.text)
            try:
                comments = json.dumps(response.json())
            except ValueError:
                # Body was not valid JSON; skip comments, keep the listing.
                logging.exception("comment response is not JSON for listing %s", room_id)
            else:
                storage_database_json(room_id, comments, "comment", "zhenguo_room_info",
                                      l_name="youjia_tpp")
        storage_database_text(house_json, 'zhenguo_room_info')
365 
366 
def crawl_room_list(account_id, token):
    """
    Fetch the online listings from the mobile API, then each listing's
    quota, and print both.

    :param account_id: account being crawled; not used by this function
    :param token: login token, sent as the ``token`` cookie
    :return: None
    """
    app_header = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 TitansX/11.6.12 "
                                "KNB/1.2.0 android/5.1.1 phoenix/com.meituan.phoenix/2.6.0 com.meituan.phoenix/2.6.0",
                  "Cookie": "token=" + token}
    list_url = "https://iphx.meituan.com/ds/product/online/list"
    list_resp = session_get(url=list_url, header=app_header)
    if not list_resp:
        return

    for room_json in list_resp.json()['data']['list']:
        room_id = room_json['productId']
        product_quota_url = "https://iphx.meituan.com/api/product/api/v1/product/getProductQuota/" + str(room_id)
        product_quota_resp = session_get(url=product_quota_url, header=app_header)
        print(room_json)
        # session_get yields a falsy value when every attempt failed; skip
        # this listing's quota instead of crashing the whole crawl.
        if not product_quota_resp:
            logging.warning("quota request failed for listing %s", room_id)
            continue
        print(product_quota_resp.json()['data'])
382 
383 
384 
def crawl(account_id, token):
    """
    Entry point for crawling once the login session is established.

    Only the mobile-API listing crawl is active; the other two crawlers are
    kept below as commented-out toggles.
    :return: None
    """
    crawl_room_list(account_id=account_id, token=token)  # mobile-side listing data

    # crawl_room(account_id, token)  # listing pages
    # crawl_order(account_id, token)  # orders
394 
395 
def login(username, password):
    """
    Run the whole login flow: pre-login page, form submission, token
    extraction and redirect.

    :param username: account name
    :param password: account password
    :return: the ``token`` value extracted from the login response
    :raises RuntimeError: when a stage produces no result; the message
        names the stage that failed
    """
    html_pre_login = pre_login()
    if html_pre_login is None:
        raise RuntimeError("login failed: pre-login page could not be fetched")

    param = parse_param(html_pre_login)
    print("param: ", param)
    if param is None:
        raise RuntimeError("login failed: login form fields could not be parsed")

    html_login = formal_login(username, password, param)
    # print(html_login)
    if html_login is None:
        raise RuntimeError("login failed: login request returned no page")

    token_json = parse_token(html_login)
    print("token_json: ", token_json)
    if token_json is None:
        raise RuntimeError("login failed: token could not be parsed from the login response")

    redirect_login(token_json)
    return token_json['token']
406 
407 
if __name__ == '__main__':
    # Credentials as written in the script; account id 1 is passed to the crawl.
    username, password = 'username', 'username'
    token = login(username, password)
    crawl(1, token)
伏地僧
关注
0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
榛果美团登录爬虫 requests session

所有美团方面旗下的登陆都采用重定向来解决登陆问题即利用session 对话来解决登陆问题当然也可以每次都模拟他的cookie来进行登陆我用的代理是阿布云代理你们也可以选择别代理这次是爬取的美团旗下的榛果民宿 1 import requests 2 from urllib.parse import urlencode 3 import...
复制链接

扫一扫