Targeted Crawling of Web Content
1. Targeted crawling means first knowing what you are after: define a clear goal before writing any code.
2. Filter candidate URLs against that goal; in plain terms, find the key address that actually serves the data you want. For example, content behind a "load more" button is fetched through an interface that never shows up in the browser's address bar, so you need a packet-capture tool such as Fiddler or Charles to discover it.
3. From the response of the captured request, filter out the content you want (see the sketch after this list).
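For instance, once the capture tool has revealed the hidden "load more" endpoint, the workflow reduces to requesting that address directly and filtering the response. Below is a minimal sketch of the pattern; the endpoint URL and the "title" field are hypothetical placeholders, not part of the Tencent example that follows.

import re
import urllib.request

# Hypothetical "load more" endpoint discovered with Fiddler/Charles
url = "https://example.com/api/list?page=1"
data = urllib.request.urlopen(url).read().decode('utf-8')

# Keep only the fields we care about from the raw response
titles = re.compile('"title":"(.*?)"', re.S).findall(data)
for title in titles:
    print(title)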
Example: scraping the comments on a Tencent Video video.
import http.cookiejar
import re
import ssl
import urllib.request

def getComment(url):
    head = {"Host": "video.coral.qq.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Referer": "https://page.coral.qq.com/coralpage/comment/video.html",
            "Connection": "keep-alive"}
    # Install an opener that carries a cookie jar and the headers above
    cookjar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookjar))
    opener.addheaders = list(head.items())
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    # Pull the fields we want out of the JSON-like response with regexes
    pattern_id = '"id":"(.*?)"'
    pattern_content = '"content":"(.*?)"'
    pattern_up = '"up":"(.*?)"'
    pattern_last = '"last":"(.*?)"'
    idList = re.compile(pattern_id, re.S).findall(data)
    contentList = re.compile(pattern_content, re.S).findall(data)
    upList = re.compile(pattern_up, re.S).findall(data)
    lastList = re.compile(pattern_last, re.S).findall(data)
    print("Page cursor: " + str(lastList[0]))
    for i in range(len(idList)):
        print("User ID: " + str(idList[i]))
        # The content arrives as \uXXXX escape sequences; decode them for display
        print("Comment: " + contentList[i].encode('utf-8').decode('unicode_escape'))
        print("Upvotes: " + str(upList[i]))
        print("\n")

# Using HTTPS: skip certificate verification for this request
ssl._create_default_https_context = ssl._create_unverified_context
url = "https://video.coral.qq.com/varticle/2845740235/comment/v2?callback=_varticle2845740235commentv2&orinum=10&cursor=6423579606449110299"
getComment(url)
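A side note: the response to this request is JSONP, meaning the JSON body is wrapped in the callback named by the URL's callback parameter. The regexes above work on the raw text, but a more robust alternative is to strip the wrapper and let the json module do the parsing. A minimal sketch, assuming only the standard JSONP shape callbackName({...}):

import json
import re

def parse_jsonp(text):
    # Strip a JSONP wrapper such as _varticle...({...}) down to the JSON payload
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    payload = match.group(1) if match else text
    return json.loads(payload)

With the parsed dictionary you can walk the structure directly instead of pattern-matching strings, and escapes such as \uXXXX come back already decoded.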
The code above only grabs the first page of data. Let's keep improving it so that it fetches 10 pages:
import http.cookiejar
import re
import ssl
import urllib.request

# The cursor identifies which page of comment data to fetch
cursor = "6421123374681785303"
head = {"Host": "video.coral.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Referer": "https://page.coral.qq.com/coralpage/comment/video.html",
        "Connection": "keep-alive"}
cookjar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookjar))
opener.addheaders = list(head.items())
urllib.request.install_opener(opener)

# Fetch one page of comment data and return the raw response text
def getComment(cursor):
    ssl._create_default_https_context = ssl._create_unverified_context
    url = "https://video.coral.qq.com/varticle/2845740235/comment/v2?callback=_varticle2845740235commentv2&orinum=10&cursor=" + cursor
    return urllib.request.urlopen(url).read().decode('utf-8')

for i in range(1, 11):
    print("Fetching page " + str(i))
    data = getComment(cursor)
    pattern_id = '"id":"(.*?)"'
    pattern_content = '"content":"(.*?)"'
    pattern_up = '"up":"(.*?)"'
    pattern_last = '"last":"(.*?)"'
    idList = re.compile(pattern_id, re.S).findall(data)
    contentList = re.compile(pattern_content, re.S).findall(data)
    upList = re.compile(pattern_up, re.S).findall(data)
    lastList = re.compile(pattern_last, re.S).findall(data)
    # The "last" field is the cursor for the next page
    cursor = lastList[0]
    for j in range(len(idList)):
        userId = str(idList[j])
        userContent = contentList[j].encode('utf-8').decode('unicode_escape')
        userUp = str(upList[j])
        print("User ID: " + userId)
        print("Comment: " + userContent)
        print("Upvotes: " + userUp)
        print("\n")