《Python网络爬虫从入门到实践》第4章
代码:
import requests

# Address of the JSONP request that returns the comment data.
link = """https://api-zero.livere.com/v1/comments/list?callback=jQuery112403473268296510956_1531502963311&limit=10&repSeq=4272904&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&_=1531502963313"""

# Identify the client as a desktop Firefox browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

# Without a timeout, requests waits forever on a stalled connection.
r = requests.get(link, headers=headers, timeout=10)

# Dump the raw response body (JSONP text, not yet parsed).
print(r.text)
格式化之后的返回值:
/**/
typeof jQuery112403473268296510956_1531502963311 === 'function' && jQuery112403473268296510956_1531502963311({
"results": {
"parents": [{
"replySeq": 38480448,
"name": "***",
"memberId": "UID_F6B215D7DEEDEAE9AF81B2D8DB4E1E5F",
"memberIcon": "http://thirdqq.qlogo.cn/g?b=oidb&k=ricQqwkWCnMtF1oFF4D6Isg&s=100&t=1555792070",
"memberUrl": "https://qq.com/",
"memberDomain": "qq",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38480448,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "1.192.165.1",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-08-19T02:51:55.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "有点难哦",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31232886,
"memberSeq": 31773506,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38470012,
"name": "Sunny",
"memberId": "oBVoaxPUgzCwxoSxypq9Ku9WUY3c",
"memberIcon": "http://thirdwx.qlogo.cn/mmopen/vi_32/Q0j4TwGTfTJdB0PgiagfrjwNpjDujYns8kgLBUn5fYuJDPu5xUT0B3jnbljIrcibHBsp2owe7IbxxdIEgbu2TqXg/132",
"memberUrl": "http://www.wechat.com",
"memberDomain": "wechat",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38470012,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "183.250.210.197",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-08-17T05:09:51.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "试试解析真实地址抓取",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31224852,
"memberSeq": 31765380,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38469774,
"name": "Sunny",
"memberId": "oBVoaxPUgzCwxoSxypq9Ku9WUY3c",
"memberIcon": "http://thirdwx.qlogo.cn/mmopen/vi_32/Q0j4TwGTfTJdB0PgiagfrjwNpjDujYns8kgLBUn5fYuJDPu5xUT0B3jnbljIrcibHBsp2owe7IbxxdIEgbu2TqXg/132",
"memberUrl": "http://www.wechat.com",
"memberDomain": "wechat",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38469774,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "183.250.210.197",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 1,
"confirm": 0,
"subCount": 0,
"regdate": "2019-08-17T04:37:26.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "测试网络爬虫",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31224852,
"memberSeq": 31765380,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38444830,
"name": "照空",
"memberId": "UID_E89F1487563A6463B8C2653589A26C13",
"memberIcon": "http://thirdqq.qlogo.cn/g?b=oidb&k=79JiaH75XKXia1d8y0CMPiaxA&s=100&t=1555482527",
"memberUrl": "https://qq.com/",
"memberDomain": "qq",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38444830,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "113.222.176.166",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-08-14T07:41:17.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "test",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31210855,
"memberSeq": 31751262,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38356587,
"name": "O0o0O0o0O",
"memberId": "UID_185A17117B12BEA662B3FEB1A8F9D657",
"memberIcon": "http://thirdqq.qlogo.cn/g?b=oidb&k=nj3cibyjaMgcZLianK9p7a5Q&s=100&t=1562782712",
"memberUrl": "https://qq.com/",
"memberDomain": "qq",
"good": 1,
"bad": 0,
"police": 0,
"parentSeq": 38356587,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "1.25.148.187",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 1,
"regdate": "2019-08-05T06:34:06.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "老师你好,我看的是您的第一版书,其中第64页上面您说(.*?)只匹配了smarter,但是63页印的结果是smarter than \n而且我自己测试代码是也是smarter than我把(.*?)改成(.*)后也是smarter than ,去官网查看了一下,好像是*?叫懒惰匹配\n因为我测试了代码的确是书上结果,但是和后面您说的不一样,是一些其他的原因吗?",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31162013,
"memberSeq": 31701971,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38349288,
"name": "雨与雨",
"memberId": "UID_FBEEEF983EA93BE422BB0FB802493F07",
"memberIcon": "http://thirdqq.qlogo.cn/g?b=oidb&k=0hP4VIwPrEx5icPVloTCC9A&s=100&t=1557075356",
"memberUrl": "https://qq.com/",
"memberDomain": "qq",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38349288,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "106.6.201.121",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-08-04T12:27:28.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "123",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31158374,
"memberSeq": 31698300,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38349283,
"name": "雨与雨",
"memberId": "UID_FBEEEF983EA93BE422BB0FB802493F07",
"memberIcon": "http://thirdqq.qlogo.cn/g?b=oidb&k=0hP4VIwPrEx5icPVloTCC9A&s=100&t=1557075356",
"memberUrl": "https://qq.com/",
"memberDomain": "qq",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38349283,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "106.6.201.121",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-08-04T12:27:09.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "677",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31158374,
"memberSeq": 31698300,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38319183,
"name": "iverson",
"memberId": "oBVoaxCVOAzR24xKYcrYOpCcU6LM",
"memberIcon": "http://thirdwx.qlogo.cn/mmopen/vi_32/Q0j4TwGTfTKOq59TrO00BVEw7JtUpG8Wcf6OeJzEEkiapyHQ3AeuU5r5yDbdSgFykLxbFGOe6nRQ9xIZ66jYgrA/132",
"memberUrl": "http://www.wechat.com",
"memberDomain": "wechat",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38319183,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "36.18.100.187",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-08-01T07:11:19.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "wyf的test",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31142089,
"memberSeq": 31681833,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38311822,
"name": "Liwkns",
"memberId": "UID_ACECDE844B8032C155C09FC0B38C7BC2",
"memberIcon": "http://thirdqq.qlogo.cn/g?b=oidb&k=0KafAicLdsVfAamlJzO470g&s=100&t=1562941631",
"memberUrl": "https://qq.com/",
"memberDomain": "qq",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38311822,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "61.186.190.44",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-07-31T07:41:44.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "4.3节用Selenium打开了网页,但是查看源代码的时候,评论那一块仍然是动态数据,有没有遇到一样问题的小伙伴交流一下啊 QQ424524128",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31135561,
"memberSeq": 31675229,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}, {
"replySeq": 38309568,
"name": "wmn",
"memberId": "wangmengningswu@163.com",
"memberIcon": "https://cdn-city.livere.com/images/user_profile_1.png",
"memberUrl": null,
"memberDomain": "livere",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38309568,
"directSeq": 0,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "61.186.190.44",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-07-31T02:55:19.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "2019.7.31",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 0,
"memberSeq": 0,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}],
"children": [{
"replySeq": 38358130,
"name": "O0o0O0o0O",
"memberId": "UID_185A17117B12BEA662B3FEB1A8F9D657",
"memberIcon": "http://thirdqq.qlogo.cn/g?b=oidb&k=nj3cibyjaMgcZLianK9p7a5Q&s=100&t=1562782712",
"memberUrl": "https://qq.com/",
"memberDomain": "qq",
"good": 0,
"bad": 0,
"police": 0,
"parentSeq": 38356587,
"directSeq": 38356587,
"shortUrl": null,
"title": "Hello world!",
"site": "http://www.santostang.com/2018/07/04/hello-world/",
"email": null,
"ipAddress": "1.25.148.187",
"isMobile": "0",
"agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
"septSns": null,
"targetService": null,
"targetUserName": null,
"info1": null,
"info2": null,
"info3": null,
"image1": null,
"image2": null,
"image3": null,
"link1": null,
"link2": null,
"link3": null,
"isSecret": 0,
"isModified": 0,
"confirm": 0,
"subCount": 0,
"regdate": "2019-08-05T08:20:28.000Z",
"deletedDate": null,
"file1": null,
"file2": null,
"file3": null,
"additionalSeq": 0,
"content": "我又测试了下觉得可能是正则表达式中的dogs限制了原因,懒惰匹配是尽可能少的,也就意味着可能找不到返回none也是正常的,因为dogs的存在,所以限制了必须要把前面的smarter than全部匹配,如果没有dogs的话,我测试是可以的,就只是匹配了smarter",
"quotationSeq": null,
"quotationContent": null,
"consumerSeq": 1020,
"livereSeq": 28583,
"repSeq": 4272904,
"memberGroupSeq": 31162013,
"memberSeq": 31701971,
"status": 0,
"repGroupSeq": 0,
"adminSeq": 25413747,
"deleteReason": null,
"sticker": 0,
"version": null
}],
"quotations": []
},
"resultCode": 200,
"resultMessage": "Okay, livere"
});
整体代码:
import json

import requests

# Address of the JSONP request that returns the comment data.
link = """https://api-zero.livere.com/v1/comments/list?callback=jQuery112403473268296510956_1531502963311&limit=10&repSeq=4272904&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&_=1531502963313"""

# Identify the client as a desktop Firefox browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

# Without a timeout, requests waits forever on a stalled connection.
r = requests.get(link, headers=headers, timeout=10)
print(r.text)

# The body is JSONP, i.e. the JSON object wrapped in a callback call:
#     /**/ typeof <callback> === 'function' && <callback>({ ... });
# Keep the text from the first '{' to the last '}'.  Locating the closing
# brace (instead of dropping a fixed two characters) still works when the
# body ends with a newline or other whitespace after ");".
json_string = r.text
json_string = json_string[json_string.find('{'):json_string.rfind('}') + 1]
json_data = json.loads(json_string)

# Top-level comments are stored under results -> parents.
comment_list = json_data['results']['parents']

# Print every comment's text with a running number that starts at 1.
for i, eachone in enumerate(comment_list, start=1):
    message = eachone['content']
    print('i=', i, '-----', message)
    # NOTE(review): the original indentation was lost, so this separator is
    # assumed to belong to the loop body -- confirm against the real script.
    print('\n')
运行结果(只有 limit=10 返回的前 10 条评论,不是全部评论):
i= 1 ----- 有点难哦
i= 2 ----- 试试解析真实地址抓取
i= 3 ----- 测试网络爬虫
i= 4 ----- test
i= 5 ----- 老师你好,我看的是您的第一版书,其中第64页上面您说(.*?)只匹配了smarter,但是63页印的结果是smarter than
而且我自己测试代码是也是smarter than我把(.*?)改成(.*)后也是smarter than ,去官网查看了一下,好像是*?叫懒惰匹配
因为我测试了代码的确是书上结果,但是和后面您说的不一样,是一些其他的原因吗?
i= 6 ----- 123
i= 7 ----- 677
i= 8 ----- wyf的test
i= 9 ----- 4.3节用Selenium打开了网页,但是查看源代码的时候,评论那一块仍然是动态数据,有没有遇到一样问题的小伙伴交流一下啊 QQ424524128
i= 10 ----- 2019.7.31
这个真实地址是怎么找到的?评论所在的网页如下:
http://www.santostang.com/2018/07/04/hello-world/
最后的代码:
import requests
import json
def single_page_comment(link):
    """Fetch one JSONP comment response and print each comment's text.

    Args:
        link: Full URL of the comment-list request.  The response is
            expected to be JSONP: a JSON object wrapped in a callback call.

    Returns:
        None.  The ``content`` field of every entry under
        ``results -> parents`` is printed, one per line.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
        json.JSONDecodeError: if no valid JSON object can be extracted from
            the response body.
        KeyError: if the decoded object lacks ``results``/``parents`` or an
            entry lacks ``content``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Without a timeout, requests waits forever on a stalled connection.
    r = requests.get(link, headers=headers, timeout=10)

    # Strip the JSONP wrapper by keeping the text from the first '{' to the
    # last '}'.  Locating the closing brace (instead of dropping a fixed two
    # characters) still works when the body ends with trailing whitespace.
    body = r.text
    json_string = body[body.find('{'):body.rfind('}') + 1]
    json_data = json.loads(json_string)

    for eachone in json_data['results']['parents']:
        print(eachone['content'])
# The two URL halves are identical for every request, so define them once;
# only the value of the "offset" query parameter placed between them changes.
link1 = "https://api-zero.livere.com/v1/comments/list?callback=jQuery1124019104109778374867_1566439164308&limit=10&offset="
link2 = "&repSeq=4272904&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&_=1566439164314"

# Request offset values 1, 2 and 3 in turn, echoing each URL before fetching.
for page in range(1, 4):
    link = "{}{}{}".format(link1, page, link2)
    print(link)
    single_page_comment(link)
link1 和 link2 的由来:在浏览器开发者工具的 Network → JS 面板中,在网页中多次点击下图所示的位置,抓取到对应的请求地址;再以 offset 的取值为界,把地址拆成前后两段。