Python: Targeted Scraping of Web Page Content

1. In targeted scraping, first know what you are after: define the goal clearly before writing any code.

2. Based on that goal, narrow down the URLs; in plain terms, find the key addresses that actually serve the data you want. For example, the endpoint behind a page's "load more" button never appears in the browser's address bar, which is where packet-capture tools such as Fiddler or Charles come in.

3. From the responses returned by the captured URLs, filter out the content you want (a toy sketch of this step follows the list).
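As a toy illustration of step 3, a regex with a non-greedy capture group can pull a single field out of a captured response body. The fragment below is made up for demonstration; it is not a real response from any endpoint.

import re

# Made-up fragment standing in for a captured response body
sample = '{"id":"10001","content":"nice video","up":"5"}'
ids = re.findall('"id":"(.*?)"', sample)
print(ids)  # ['10001']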

Example: scraping the comments of a video on Tencent Video.

import http.cookiejar
import re
import ssl
import urllib.request

def getComment(url):
    head = {"Host": "video.coral.qq.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Referer": "https://page.coral.qq.com/coralpage/comment/video.html",
            "Connection": "keep-alive"}
    # Build an opener that keeps cookies and sends the headers above
    cookjar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookjar))
    opener.addheaders = list(head.items())
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    # Extract the fields we care about from the JSONP response with regexes
    pattern_id = '"id":"(.*?)"'
    pattern_content = '"content":"(.*?)"'
    pattern_up = '"up":"(.*?)"'
    pattern_last = '"last":"(.*?)"'
    idList = re.compile(pattern_id, re.S).findall(data)
    contentList = re.compile(pattern_content, re.S).findall(data)
    upList = re.compile(pattern_up, re.S).findall(data)
    lastList = re.compile(pattern_last, re.S).findall(data)
    print("Page cursor: " + str(lastList[0]))
    for i in range(len(idList)):
        print("User id: " + str(idList[i]))
        # Decode the \uXXXX-escaped string so it displays readably
        print("Comment: " + eval('u"' + contentList[i] + '"'))
        print("Upvotes: " + str(upList[i]))
        print("\n")

# Allow HTTPS requests without certificate verification
ssl._create_default_https_context = ssl._create_unverified_context
url = "https://video.coral.qq.com/varticle/2845740235/comment/v2?callback=_varticle2845740235commentv2&orinum=10&cursor=6423579606449110299"
getComment(url)
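A side note on the eval trick above: it decodes \uXXXX escape sequences, but it will execute anything embedded in the string, so it is unsafe on untrusted input. A minimal alternative sketch using the standard codecs module (behavior can differ slightly for strings mixing raw UTF-8 text with escapes):

import codecs

# The literal text "\u4f60\u597d" as it appears in the raw response body
escaped = '\\u4f60\\u597d'
print(codecs.decode(escaped, 'unicode_escape'))  # 你好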

The code above only fetches the first page of data. Let's improve it to fetch ten pages:

import http.cookiejar
import re
import ssl
import urllib.request

# Cursor that identifies which page of comments to fetch next
cursor = "6421123374681785303"

head = {"Host": "video.coral.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Referer": "https://page.coral.qq.com/coralpage/comment/video.html",
        "Connection": "keep-alive"}
cookjar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookjar))
opener.addheaders = list(head.items())
urllib.request.install_opener(opener)

# Fetch one page of comments and return the raw response text
def getComment(cursor):
    ssl._create_default_https_context = ssl._create_unverified_context
    url = "https://video.coral.qq.com/varticle/2845740235/comment/v2?callback=_varticle2845740235commentv2&orinum=10&cursor=" + cursor
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data

for i in range(1, 11):
    print("Fetching page " + str(i))
    data = getComment(cursor)
    pattern_id = '"id":"(.*?)"'
    pattern_content = '"content":"(.*?)"'
    pattern_up = '"up":"(.*?)"'
    pattern_last = '"last":"(.*?)"'
    idList = re.compile(pattern_id, re.S).findall(data)
    contentList = re.compile(pattern_content, re.S).findall(data)
    upList = re.compile(pattern_up, re.S).findall(data)
    lastList = re.compile(pattern_last, re.S).findall(data)
    # The "last" field is the cursor for the next page
    cursor = lastList[0]
    for j in range(len(idList)):
        print("User id: " + str(idList[j]))
        print("Comment: " + eval('u"' + contentList[j] + '"'))
        print("Upvotes: " + str(upList[j]))
        print("\n")
