[Scraping in Practice] [WeChat] How to scrape app reviews with Python, using WeChat's iOS App Store reviews as the example

# -*- coding: utf-8 -*-
"""
Created on Tue Jul  2 16:26:50 2019

@author: wuxian
"""

# The complete program is as follows:
import requests
import re

def getHTMLText(url):
    # Fetch the URL and return the response body; return '' on any request error.
    try:
        r = requests.get(url)
        r.raise_for_status()
        #r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''

def printAPPName(html):
    # Extract the app name from the feed JSON with a regular expression.
    try:
        pattern = re.compile(r'{"im:name":{"label":(.*?)}, "rights"', re.S)
        # Without re.S, '.' does not match newlines, so matching is effectively
        # done line by line and anything spanning a line break is missed.
        # With re.S, the whole string is treated as one block and '\n' is
        # matched like any ordinary character.
        APPName = re.findall(pattern, str(html))
        return 'APPName:' + str(APPName)
    except Exception:
        return ''
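
# A small added illustration of the re.S flag used above (not part of the
# original program); the pattern and sample string below are made up:
#   re.findall(r'"label":(.*?)}', '"label":"a\nb"}')        # -> []           ('.' stops at '\n')
#   re.findall(r'"label":(.*?)}', '"label":"a\nb"}', re.S)  # -> ['"a\nb"']   (matches across the newline)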

def fillUnivlist(titles, comments, stars, html):
    # Pull the review titles, bodies and star ratings out of the feed JSON.
    try:
        patternTitle = re.compile(r'"title":{"label":(.*?)}, "content"', re.S)  # review titles
        titleInfo = re.findall(patternTitle, str(html))

        patternFloor = re.compile(r'"content":{"label":(.*?), "attributes":{"type":"text"}}', re.S)  # review bodies
        floorText = re.findall(patternFloor, str(html))

        patternStar = re.compile(r'"im:rating":{"label":(.*?)}, "id"', re.S)  # star ratings
        star = re.findall(patternStar, str(html))

        number = len(titleInfo)
        print(number)
        for i in range(number):
            Info = titleInfo[i]
            # The first match still carries an embedded '"title":{"label":' marker;
            # strip everything up to and including it.
            if i == 0:
                Info = Info[Info.find('"title":{"label":') + len('"title":{"label":'):]
            Info1 = floorText[i]
            Info2 = star[i]
            titles.append('title:' + Info)
            comments.append('content:' + Info1)
            stars.append('star:' + Info2)
    except Exception:
        return ''

def writeText(titleText, fpath):
    # Append one line (plus a blank separator line) to the output file.
    try:
        with open(fpath, 'a', encoding='utf-8') as f:
            f.write(str(titleText) + '\n')
            f.write('\n')
    except Exception:
        return ''

def writeUnivlist(titles, comments, stars, fpath, num):
    # Append the collected reviews to the output file, separated by rows of '*'.
    with open(fpath, 'a', encoding='utf-8') as f:
        for i in range(num):
            f.write(str(stars[i]) + '\n')
            f.write('*' * 10 + '\n')
            f.write(str(titles[i]) + '\n')
            f.write('*' * 50 + '\n')
            f.write(str(comments[i]) + '\n')
            f.write('*' * 100 + '\n')

def main():
    count = 0
    # App Store page:  https://itunes.apple.com/cn/app/id1469526398?l=zh&ls=1&mt=8
    # Review feed:     https://itunes.apple.com/rss/customerreviews/page=1/id=414478124/sortby=mostrecent/json?l=en&&cc=cn
    url = 'https://itunes.apple.com/rss/customerreviews/page=1/id=414478124/sortby=mostrecent/json?l=en&&cc=cn'  # review feed to request
    output_file = r'C:\Users\wuxian\Desktop\doing\COCOK.txt'  # file the results are written to
    html = getHTMLText(url)  # fetch the feed
    print(html)
    APPName = printAPPName(html)
    writeText(APPName, output_file)
    for i in range(1, 11):  # pages 1-10 of the review feed
        titles = []
        comments = []
        stars = []
        url = 'https://itunes.apple.com/rss/customerreviews/page=' + str(i) + '/id=414478124/sortby=mostrecent/json?l=en&&cc=cn'
        html = getHTMLText(url)
        print(html)
        fillUnivlist(titles, comments, stars, html)
        writeUnivlist(titles, comments, stars, output_file, len(titles))
        count = count + 1
        print("\rProgress: {:.2f}%".format(count * 100 / 10), end="")

if __name__ == '__main__':
    main()
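
Since the feed endpoint already returns JSON, the same fields can also be pulled out with the json parsing that requests provides instead of regular expressions. The sketch below is only an illustration for comparison, not part of the program above; it assumes the response carries a feed.entry list in which every review exposes title.label, content.label and im:rating.label (the same fields the regular expressions above target), and fetch_reviews_json is a hypothetical helper name.

import requests

def fetch_reviews_json(page, app_id='414478124'):
    # Same review feed as above, parsed as JSON instead of with regex.
    url = ('https://itunes.apple.com/rss/customerreviews/page=' + str(page) +
           '/id=' + app_id + '/sortby=mostrecent/json?l=en&&cc=cn')
    data = requests.get(url).json()
    reviews = []
    # Assumption: 'entry' is a list of reviews, each exposing a 'label' value
    # under 'title', 'content' and 'im:rating'.
    for entry in data.get('feed', {}).get('entry', []):
        reviews.append({
            'star': entry.get('im:rating', {}).get('label', ''),
            'title': entry.get('title', {}).get('label', ''),
            'content': entry.get('content', {}).get('label', ''),
        })
    return reviews

# Example: print the star rating and title of each review on page 1.
# for review in fetch_reviews_json(1):
#     print(review['star'], review['title'])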