今日头条街拍图片的AJAX爬取
我们知道今日头条是由字节跳动这样的大公司创办的,所以上面一些有版权的内容是不容易爬取的,不过今天我们就来爬取。因为里面一些内容是经过js加密的,或者需要发起AJAX请求后台才会把数据返回给你。爬虫就是通过机器模拟浏览器请求从而得到数据,如果我们要获得AJAX加载的数据,就要通过模拟AJAX请求才行。这种请求的模拟比较复杂,不过这次我们就来试试。
如果你想简单了解什么是AJAX请求,请看 https://blog.csdn.net/Deng872347348/article/details/113814708
首先我们要确认爬取的页面:
如图1:
平时我们爬取一些简单的网页,我们都是通过三步走就行了
import requests

# The usual "three step" fetch for a plain page: URL, headers, GET.
url = 'http://www.baidu.com'
headers = {
    # Pretend to be a desktop Chrome so the server returns the normal page.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
resp = requests.get(url, headers=headers).text  # .text decodes the body to str
print(resp)
平时我们通过这几步就可以获取html页面内容了,但是,当数据是通过AJAX请求加载的时候:
https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D
如上,URL后面的keyword参数是经过URL编码的(%E8%A1%97%E6%8B%8D就是“街拍”),后面的数据要由浏览器再发送AJAX请求,后台服务器才会返回给你。
平常我们看网页如图2:
但是当我们分析AJAX请求的时候我们要看如图3:
我们在浏览器里往下翻的时候,图3那里的数据会慢慢地加载出来。这就是AJAX请求:由浏览器异步发送的请求。
如图片4:
我们要模拟浏览器的话,就要把上面的表单内容放到代码中,这样模拟浏览器。
下面正式开始:
由于是AJAX请求,我们要用selenium模拟浏览器。我们这次不是模拟登录,所以用无头浏览器模拟就行。
from selenium import webdriver

def get_cookies(url):
    # Drive an automated Chrome; '--headless' means no visible window.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    browser.get(url)
下一步我们就是headers:
# NOTE(review): 'cookies' here must be the string built by get_cookies().
headers = {
    'cookie': cookies,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # This header marks the request as AJAX to the server.
    'x-requested-with': 'XMLHttpRequest',
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D'
}
如图5,我们要获取cookies
这个很麻烦,有时候它会发生动态变化,所以我们就动态获取cookies:
# Cookies change dynamically, so fetch a fresh set from the site itself.
cookies = get_cookies('https://www.toutiao.com/')
headers = {
    'cookie': cookies,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',  # marks the request as AJAX
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D'
}
# (body of get_cookies, continued) The Cookie header wants one
# "name=value;..." string, so the dynamic cookies are joined by hand.
str = ''  # NOTE(review): shadows the builtin str — rename in real code
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)
browser.get(url)
# Read the dynamic cookies that https://www.toutiao.com/ has set.
for i in browser.get_cookies():
    try:
        name = i.get("name")
        value = i.get("value")
        str = str + name + '=' + value + ';'
    except ValueError as e:
        print(e)
return str  # the assembled cookie string
接下来我们要把图4表单里面的信息添加进去
def get_page(offset):
    """Fetch one page of Toutiao search results for the keyword 街拍.

    offset -- pagination offset of the AJAX API (0, 20, 40, ...).
    Returns the parsed JSON dict on HTTP 200, otherwise None.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,   # FIX: the page offset belongs here ...
        "format": "json",   # ... and the response format is fixed, not paged
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
    }
    url = "https://www.toutiao.com/api/search/content/"
    # The try/except tells us whether the simulated AJAX request got through.
    try:
        r = requests.get(url, params=params, headers=headers)
        if r.status_code == 200:
            return r.json()
        print("requests get_page error!")
        return None
    except requests.ConnectionError:
        return None
下面,我们来加载Json数据,并且提取json()数据
如图6,我们可以看到我们点击进去后面我们可以看到json数据
我们就来找里面的json数据进行匹配,这里的json数据和html页面不同,我们只能用正则进行匹配
# (body of get_images; 'json' is the dict returned by get_page)
data = json.get("data")
if data:
    for i in data:
        if i.get('title'):
            title = re.sub('[\t]', ',', i.get('title'))  # re.sub(): regex substitution
            url = i.get('article_url')
            if url:
                r = requests.get(url, headers=headers)
                if r.status_code == 200:
                    # The gallery data sits inside a JSON.parse("...") call in the HTML.
                    imgags_pattern = re.compile('JSON.parse\("(.*?)"\),\n', re.S)  # .*?
                    result = re.search(imgags_pattern, r.text)
                    if result:
                        b_url = 'https://p3.pstatp.com/origin/pgc-image/'
                        up = re.compile('url(.*?)"width', re.S)  # re.S: '.' also matches newlines
                        results = re.findall(up, result.group(1))
                        if results:
                            for result in results:
                                yield {
                                    'title': title,
                                    # keep only the file name after the last escaped 'F'
                                    'image': b_url + re.search('F([^F]*)\\\\",', result).group(1)
                                }
            else:
                # No article page: fall back to the thumbnail list.
                images = i.get('image_list')
                for image in images:
                    # 'large' gives big images; origin/pgc-image would be the original
                    origin_image = re.sub("list.*?pgc-image", "large/pgc-image",
                                          image.get('url'))
                    yield {
                        'image': origin_image,
                        'title': title
                    }
下面是数据的持久化,数据的存储
# (body of save_image; 'item' is one dict yielded by get_images)
title = re.sub(r"[./\\,,!!??|]", "", item.get('title'))  # strip punctuation unsafe in dir names
img_path = "img" + os.path.sep + title
if not os.path.exists(img_path):
    os.makedirs(img_path)  # create the per-title directory
try:
    resp = requests.get(item.get('image'))
    if requests.codes.ok == resp.status_code:
        # md5 of the bytes names the file, so identical images dedupe naturally
        file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
            file_name=md5(resp.content).hexdigest(),
            file_suffix='jpg')  # path of the single file
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(resp.content)
            print("Downloaded image path is %s" % file_path)
        else:
            print('Already Downloade', file_path)
except Exception as e:
    print(e, 'noe123')
# Build the shared headers once, with freshly fetched dynamic cookies.
cookies = get_cookies('https://www.toutiao.com/')
headers = {
    'cookie': cookies,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',  # marks the request as AJAX
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D'
}
主程序的调用:
def main(offset):
    # Pipeline: fetch one result page -> extract image items -> save each one.
    a = get_page(offset)
    for i in get_images(a):
        save_image(i)

if __name__ == '__main__':
    # A multiprocessing pool cannot share the selenium cookies across
    # processes, hence the plain sequential loop.
    for i in [x * 20 for x in range(3)]:  # offsets 0, 20, 40
        main(i)
完整代码:
import requests, re, os
from hashlib import md5 # 去重
from selenium import webdriver
def get_cookies(url):
    """Open *url* in a headless Chrome and return its cookies as one string.

    The result is formatted as 'name=value;name=value;...' so it can be used
    directly as the value of a Cookie request header.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # no visible browser window
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        # get_cookies() returns a list of dicts; join them into header form.
        # (dict.get never raises, so the original except ValueError was dead.)
        pairs = []
        for cookie in browser.get_cookies():
            name = cookie.get("name")
            value = cookie.get("value")
            if name is not None and value is not None:
                pairs.append('%s=%s;' % (name, value))
        return ''.join(pairs)
    finally:
        # FIX: the original leaked the Chrome process on every call.
        browser.quit()
# headers
def get_page(offset):
    """Fetch one page of Toutiao search results for the keyword 街拍.

    offset -- pagination offset of the AJAX API (0, 20, 40, ...).
    Returns the parsed JSON dict on HTTP 200, otherwise None.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,   # FIX: the page offset belongs here ...
        "format": "json",   # ... and the response format is fixed, not paged
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
    }
    url = "https://www.toutiao.com/api/search/content/"
    # The try/except tells us whether the simulated AJAX request got through.
    try:
        r = requests.get(url, params=params, headers=headers)
        if r.status_code == 200:
            return r.json()
        print("requests get_page error!")
        return None
    except requests.ConnectionError:
        return None
def get_images(json):
    """Yield {'title', 'image'} dicts extracted from one page of results.

    *json* is the dict returned by get_page(). For items with an article
    URL the article HTML is fetched and gallery URLs are recovered with
    regexes, because the image data is embedded in a JSON.parse("...") call
    rather than in plain markup; otherwise the item's image_list is used.
    NOTE(review): the regexes are tied to Toutiao's page layout at the time
    of writing and will break if the markup changes.
    """
    data = json.get("data")
    if data:
        for i in data:
            if i.get('title'):
                title = re.sub('[\t]', ',', i.get('title'))  # re.sub(): regex substitution
                url = i.get('article_url')
                if url:
                    r = requests.get(url, headers=headers)
                    if r.status_code == 200:
                        # The gallery data sits inside a JSON.parse("...") call.
                        imgags_pattern = re.compile('JSON.parse\("(.*?)"\),\n', re.S)  # .*?
                        result = re.search(imgags_pattern, r.text)
                        if result:
                            b_url = 'https://p3.pstatp.com/origin/pgc-image/'
                            up = re.compile('url(.*?)"width', re.S)  # re.S: '.' also matches newlines
                            results = re.findall(up, result.group(1))
                            if results:
                                for result in results:
                                    yield {
                                        'title': title,
                                        # keep only the file name after the last escaped 'F'
                                        'image': b_url + re.search('F([^F]*)\\\\",', result).group(1)
                                    }
                else:
                    # No article page: fall back to the thumbnail list.
                    images = i.get('image_list')
                    for image in images:
                        # 'large' gives big images; origin/pgc-image would be the original
                        origin_image = re.sub("list.*?pgc-image", "large/pgc-image",
                                              image.get('url'))
                        yield {
                            'image': origin_image,
                            'title': title
                        }
def save_image(item):
    """Download item['image'] into img/<title>/<md5>.jpg, skipping duplicates.

    item -- a dict with 'title' and 'image' keys as yielded by get_images().
    The md5 of the image bytes names the file, which both deduplicates and
    avoids unsafe characters; punctuation is stripped from the title before
    it is used as a directory name.
    """
    title = re.sub(r"[./\\,,!!??|]", "", item.get('title'))
    img_path = "img" + os.path.sep + title
    # FIX: exist_ok avoids the check-then-create race of the original.
    os.makedirs(img_path, exist_ok=True)
    try:
        resp = requests.get(item.get('image'))
        if resp.status_code == requests.codes.ok:
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=md5(resp.content).hexdigest(),
                file_suffix='jpg')  # path of the single file
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print("Downloaded image path is %s" % file_path)
            else:
                # FIX: message typo ("Downloade") corrected.
                print('Already Downloaded', file_path)
    except Exception as e:
        # Best-effort: log and continue with the next image.
        # (FIX: dropped the stray 'noe123' debug tag from the output.)
        print(e)
# Module-level setup: this runs at import time and launches a headless
# Chrome once, so a single cookie jar is shared by every request below.
cookies = get_cookies('https://www.toutiao.com/')
headers = {
    'cookie': cookies,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',  # marks the request as AJAX
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D'
}
# parse the page
# save the files
def main(offset):
    """Run the full pipeline for one result page at the given offset."""
    page = get_page(offset)
    for entry in get_images(page):
        save_image(entry)
if __name__ == '__main__':
    # Sequential on purpose: a process pool cannot share the
    # selenium-derived cookies across processes.
    for page_offset in range(0, 60, 20):  # offsets 0, 20, 40
        main(page_offset)
效果图片: