获取微医评论数据

14 篇文章 0 订阅
12 篇文章 1 订阅

简单获取微医的数据,只公开60页数据,下一页的time和sign能在当前页面获取到

就诊医生需要注意,分2种,一直是有a标签包裹,一直直接是span标签包裹,需要对每个评论单独处理

今天就不写存数据库,和多线程了~,

import requests
import re
import time
import random
from lxml import etree


class WeiYi(object):
    def __init__(self):
        """初始化"""
        self.pattern_val = re.compile('name="sign".*?value="(.*?)"')
        self.pattern_time = re.compile('name="timestamp".*?value="(.*?)"')
        self.doctor_name = re.compile('.*(\w)')
        self.url = "https://www.guahao.com/commentslist/h-125336754304601000/1-0"
        self.data = "F98CADB881686C6E7DE0D78711CD6739385CC337DAEBF347934BDDB13C018E76415AE203054FBA9A"
        self.data1 = "1527943007685"
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": "_sid_=15224054212110651956551; _fp_code_=b99b3fcb2c173a2719f2157a54bcdbf3; _ipgeo=province%3A%E6%B9%96%E5%8C%97%7Ccity%3A%E6%AD%A6%E6%B1%89; _e_m=1527939221005; Hm_lvt_5697507823ecd633819db0771bb99cfb=1527939225,1527939251; _sh_ssid_=1527941695607; monitor_sid=9; mst=1527941697466; JSESSIONID=1l7g9ke10cd2t1xff1vfkifrfh; __usx__=""; __wyt__=!PVH7BIjcEnE1srkCUQ6GNbCt3O1D_EWqSk-HZOR6OraaJo1L9mCfVxYwySALHvu2K_P8lESrBnnIhXAjFPbz63ZiO_hIA4wInPOvtC3P8AD45XUMJG-6ChkCHOE2AKqRBSOxlcvtaJIqbiFW4AVuHXqvrm3jn8kkGYW9RfupMxrGI; _ci_=QgAHlxJ/u9Dbai4pGj1pAHgiYzjGt3Y20M0OwI/o+XUh5jsYkgZUR6s5/nFJXPhE; __i__=zOnom529CLgbUB/iEwNh1O7QojaVerY0y0UdHwxmNuU=; __uiu__=Qvh+0aIbdN8GEF9A6ypvhrvJ+BIz3RNey3ofkYSMf5Orm+bShvXDfA==; __up__=LpYLUy6TQ7+JzbTK2mU3H/Oc1XD2s9Wba2izLueDk1E=; _exp_=LyRgvLkJK27tn50gWWifp3FdKjsg8CiAiZ43l1RnnpE=; __p__=4/qTaUSM35f8C1pDIT7jcY3KWXX3F4yn8EQmtblko2Y4UMmKvHZ3gg==; __rf__=OWQfOTal7lck2lXGICUgbVb582rFAz6EAAT6csy3CNyHl1Thsqq6ZzwFelhTGmm4vQfIXwdE7w+ajGlWSp68hGdlS9eLY8N/jqeNOc8RrnWMv6suCWzdU+NmMoeXkfHBKjFAR4F/Pqo=; monitor_seq=24; mlt=1527943229308; Hm_lpvt_5697507823ecd633819db0771bb99cfb=1527943229",
            "referer": "https://www.guahao.com/commentslist/h-125336754304601000/1-0?pageNo=11&sign=F98CADB881686C6E7DE0D78711CD6739385CC337DAEBF347934BDDB13C018E76CABBF37CB7B6B10D&timestamp=1527942843278",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
        }
        self.params = {
            "pageNo": "1",
            "sign": self.data,
            "timestamp": self.data1,
        }

    def run(self):
        ssion = requests.session()
        end_url = ""
        for i in range(1, 5):
            time.sleep(random.uniform(1, 3))
            if i == 1:
                response = ssion.get(url=self.url, headers=self.headers, params=self.params)
                end_url = response.url
            else:
                headers = {
                    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "accept-encoding": "gzip, deflate, br",
                    "accept-language": "zh-CN,zh;q=0.9",
                    "cache-control": "max-age=0",
                    "cookie": "_sid_=15224054212110651956551; _fp_code_=b99b3fcb2c173a2719f2157a54bcdbf3; _ipgeo=province%3A%E6%B9%96%E5%8C%97%7Ccity%3A%E6%AD%A6%E6%B1%89; _e_m=1527939221005; Hm_lvt_5697507823ecd633819db0771bb99cfb=1527939225,1527939251; _sh_ssid_=1527941695607; monitor_sid=9; mst=1527941697466; JSESSIONID=1l7g9ke10cd2t1xff1vfkifrfh; __usx__=""; __wyt__=!PVH7BIjcEnE1srkCUQ6GNbCt3O1D_EWqSk-HZOR6OraaJo1L9mCfVxYwySALHvu2K_P8lESrBnnIhXAjFPbz63ZiO_hIA4wInPOvtC3P8AD45XUMJG-6ChkCHOE2AKqRBSOxlcvtaJIqbiFW4AVuHXqvrm3jn8kkGYW9RfupMxrGI; _ci_=QgAHlxJ/u9Dbai4pGj1pAHgiYzjGt3Y20M0OwI/o+XUh5jsYkgZUR6s5/nFJXPhE; __i__=zOnom529CLgbUB/iEwNh1O7QojaVerY0y0UdHwxmNuU=; __uiu__=Qvh+0aIbdN8GEF9A6ypvhrvJ+BIz3RNey3ofkYSMf5Orm+bShvXDfA==; __up__=LpYLUy6TQ7+JzbTK2mU3H/Oc1XD2s9Wba2izLueDk1E=; _exp_=LyRgvLkJK27tn50gWWifp3FdKjsg8CiAiZ43l1RnnpE=; __p__=4/qTaUSM35f8C1pDIT7jcY3KWXX3F4yn8EQmtblko2Y4UMmKvHZ3gg==; __rf__=OWQfOTal7lck2lXGICUgbVb582rFAz6EAAT6csy3CNyHl1Thsqq6ZzwFelhTGmm4vQfIXwdE7w+ajGlWSp68hGdlS9eLY8N/jqeNOc8RrnWMv6suCWzdU+NmMoeXkfHBKjFAR4F/Pqo=; monitor_seq=24; mlt=1527943229308; Hm_lpvt_5697507823ecd633819db0771bb99cfb=1527943229",
                    "referer": end_url,
                    "upgrade-insecure-requests": str(i),
                    "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
                }
                params = {
                    "pageNo": str(i),
                    "sign": self.data,
                    "timestamp": self.data1,
                }
                response = ssion.get(url=self.url, headers=headers, params=params)
                end_url = response.url
            # print(end_url)
            html = response.text
            self.data = self.pattern_val.findall(html)
            self.data1 = self.pattern_time.findall(html)
            print(self.data[0])
            print(self.data1[0])
            self.extract(html)
            break

    def extract(self, html):
        """因为有的医生是span,有的是在span下面,就只能对每个医生单独操作"""
        xhtml = etree.HTML(html)
        xhtml = xhtml.xpath('//*[@id="comment-list"]/li')

        for i in range(1, 6):

            name = xhtml[0].xpath('//li[%s]/div[1]/p/text()' % i)
            # print(name[0].strip())
            name = name[0].strip()
            grade = xhtml[0].xpath('//li[%s]/div[2]/p[2]/span' % i)
            # 计算出现的星号次数
            grade_num = len(grade)
            # print(grade_num)

            try:
                doctor = xhtml[0].xpath('//li[%s]/div[3]/div[2]/p/span[2]/a/text()' % i)[0]
            except Exception as e:
                doctor = xhtml[0].xpath('//li[%s]/div[3]/div[2]/p/span[2]/text()' % i)
                # print(doctor[0])
                temp = str(doctor[0])
                doctor = self.doctor_name.findall(str(temp), re.S)
                # print(temp)
                # print(doctor)

            # print(doctor[0].strip())
            comment = xhtml[0].xpath('//li[%s]/div[3]/div[1]/span/text()' % i)
            # print(comment[0].strip())
            time = xhtml[0].xpath('//li[%s]/div[3]/div[2]/p/span[1]/text()' % i)
            # print(time[0].strip())
            self.save(name, grade_num, doctor, comment, time)

    def save(self, name, grade_num, doctor, comment, time):
        print(name[0].strip())
        print(grade_num)
        print(doctor[0].strip())
        print(comment[0].strip())
        print(time[0].strip())



# with open("微医.html", "w", encoding="utf-8") as f:
#     f.write(response.content.decode("utf-8"))

if __name__ == '__main__':
    weiyi = WeiYi()
    weiyi.run()

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值