Python3淘宝爬取评价内容

最新推荐文章于 2024-06-02 16:38:43 发布

18923489164

最新推荐文章于 2024-06-02 16:38:43 发布

阅读量410

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/AnYeZhiYin/article/details/82954842

版权

爬虫专栏收录该内容

117 篇文章 13 订阅

订阅专栏


import urllib.request

import re



def urlopen(url):
    """Fetch ``url`` with a desktop-browser User-Agent and return the raw body.

    Args:
        url: Address to request; handed to ``urllib.request.Request``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` (for example
        ``urllib.error.URLError``) propagates to the caller unchanged.
    """
    req = urllib.request.Request(url)
    # Replace urllib's default User-Agent with a Chrome one
    # (presumably so the site answers as it would for a browser).
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")

    # The context manager closes the response even when read() raises,
    # so the underlying connection is never left open.
    with urllib.request.urlopen(req) as response:
        return response.read()



def cont(url):
    """Print the (timestamp, reviewer ID, review text) tuples scraped from ``url``.

    The page is downloaded with ``urlopen``, decoded as UTF-8, stripped of a
    fixed list of emoji, and then searched with regular expressions. The
    three result lists are combined with ``zip``, so output stops at the
    shortest list.

    Args:
        url: Address of one review-list page.

    Returns:
        None. Each tuple is written to standard output with ``print``.
    """
    page = urlopen(url).decode("utf-8")

    # Emoji code points encountered so far. The original author notes that
    # more exist and that unlisted ones cause errors, so extend this list
    # whenever a new one shows up.
    emoji = re.compile(r'(\U0001f60a)|(\U0001f603)|(\U0001f642)|(\U0001f44d)|(\u2b50)|(\U0001f917)|(\U0001f637)|(\U0001f913)|(\U0001f602)|(\U0001f61b)|(\uff65)|(\U0001f44f)|(\U0001f613)|(\U0001f648)|(\U0001f44e)|(\U0001f44c)')
    # Drop every listed emoji by replacing it with the empty string.
    page = emoji.sub('', page)

    # Review timestamps such as "2018年10月06日 12:34".
    timestamps = re.findall(r'(....年..月..日 ..:..)', page)

    # Masked reviewer IDs: one character, three literal asterisks, one character.
    reviewer_ids = re.findall(r'(.\*\*\*.)', page)

    # The pattern has three groups (left marker, text, right marker);
    # only the middle group is the review text itself.
    reviews = [
        groups[1]
        for groups in re.findall(r'(content":")(.{0,300})(","rateId)', page)
    ]

    # Review thumbnail fragments. They are extracted but not downloaded or
    # printed; kept for anyone who wants to use them.
    thumbnails = re.findall(r'thumbnail":"//img.alicdn.com/imgextra/.././.+?jpg","url', page)

    # zip() pairs the lists element by element and stops at the shortest one.
    for row in zip(timestamps, reviewer_ids, reviews):
        print(row)


        
# Walk review pages 1 through 59, printing each page number before its reviews.
page_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId=15088074383&userNumId=44607594189&currentPageNum={}'
for page_num in range(1, 60):
    print(page_num)
    cont(page_url.format(page_num))

18923489164

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python3淘宝爬取评价内容

import urllib.request import re def urlopen(url): req = urllib.request.Request(url) req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like...
复制链接

扫一扫