url:今日头条,搜索“街拍”并打开
https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D
浏览器:firefox
分析:打开页面,在空白处单击鼠标右键,选择“查看元素”。
在下方弹出的元素框内选择“网络”,并在右边的框内选择“XHR”。
将网页往下拉……
直到元素框内有数据出现。
#coding=utf-8
import re
import time
import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
def getHtml(url, offset, keyword="街拍", timeout=10):
    """Fetch one page of Toutiao search results and return the parsed JSON.

    Args:
        url: Endpoint prefix. The encoded query string is appended to it
            directly, so it must already end with ``?`` (or ``&``).
        offset: Value sent as the ``offset`` query parameter (paging).
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the
        server answers with an HTTP error status, or the body is not
        valid JSON.
    """
    # Send a desktop Firefox User-Agent so the request looks like a browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) '
                      'Gecko/20100101 Firefox/56.0',
    }
    params = {
        "autoload": "true",
        "count": '20',
        "cur_tab": '1',
        "format": "json",
        "from": "search_tab",
        "keyword": keyword,
        "offset": offset,
    }
    try:
        response = requests.get(
            url + urlencode(params),
            headers=headers,
            timeout=timeout,
        )
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        return response.json()
    except (RequestException, ValueError):
        # ValueError: older requests versions raise it (not a
        # RequestException) when the body is not valid JSON.
        return None
if __name__ == '__main__':
    # Collect article URLs from the search results, then print them.
    urls = []
    url = "https://www.toutiao.com/search_content/?"
    host_url = "https://www.toutiao.com/a"
    # range(0, 20, 20) yields only offset 0, i.e. a single page is fetched;
    # raise the stop value to walk through more pages.
    for offset in range(0, 20, 20):
        html = getHtml(url, offset)
        if html is None:
            continue
        # 'data' may be missing or null in the response; iterate over an
        # empty list in that case instead of raising TypeError.
        data = html.get('data') or []
        for item in data:
            group = item.get('group_id')
            if group is None:
                # Entries without a group_id are skipped.
                continue
            # str() keeps the concatenation valid if group_id is numeric.
            group = host_url + str(group)
            urls.append(group)
        # Pause one second between page requests.
        time.sleep(1)
    for url in urls:
        print(url)
欢迎关注本人微信公众号,会分享更多的干货: