本人正在自学python,所以学习了一下python爬虫。以下是自己做的小测试
以爬取图虫网的免费图片为例
1,首先打开图虫网搜索美女,打开f12刷新,看发送的链接找到查询图片相关的链接,如图发现在返回的页面中script标签中window.hits 属性包含图片有关的信息imageId(链接是https://stock.tuchong.com/free/search/?term= 搜索条件)
2,点击图片查询单张图片找到如图下所示静态图片地址(发现规律,图片名称是用image+.jpg拼接而成)
综上,思路如下
(1)访问链接https://stock.tuchong.com/free/search/?term= 搜索条件 获取网页,然后拿到网页script下的imageId
(2)遍历imageId拼接图片路径https://weiliicimg9.pstatp.com/weili/l/imageId.jpg
(3)访问图片路径下载到本地即可
代码如下(简单入门:本人python也是刚刚学,所以没做太多格式处理,只简单写了代码):
导包
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import hashlib
import urllib.request
import os
# Scrape free stock photos from tuchong (stock.tuchong.com).
def _extract_image_ids(html):
    """Return the list of imageId strings embedded in a search-results page.

    The page ships its search hits as a JSON array assigned to
    ``window.hits`` inside one of its <script> tags.  Search every script
    for that marker instead of assuming a fixed script index (the
    original hard-coded ``ba[1]``, which breaks if the page layout
    shifts), slice the array text out, and parse it as JSON.
    Returns [] when no script contains the marker.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script'):
        text = str(script.string)
        if 'window.hits = [' not in text:
            continue
        # Everything between "window.hits = [" and the first "]" is the
        # array body; re-wrap it in brackets before json.loads.
        payload = '[' + text.split('window.hits = [')[1].split(']')[0] + ']'
        return [hit['imageId'] for hit in json.loads(payload)]
    return []


def _download_image(image_id, dest_dir, headers):
    """Download one image by id into *dest_dir* as ``tc<imageId>.jpg``.

    The CDN serves the file behind a predictable static URL built from
    the imageId.  A browser-like User-Agent is required: without it the
    CDN answers HTTP 403 Forbidden (the original worked around this by
    installing a *global* urllib opener per iteration; passing headers
    to requests has the same effect without the global side effect).
    """
    image_url = 'https://weiliicimg9.pstatp.com/weili/l/' + image_id + '.jpg'
    print('下载图片:' + image_url)
    resp = requests.get(image_url, headers=headers, timeout=30)
    resp.raise_for_status()
    with open(os.path.join(dest_dir, 'tc' + image_id + '.jpg'), 'wb') as fh:
        fh.write(resp.content)


def get_TCImage():
    """Prompt for a search term and download every matching free image.

    Fetches ``https://stock.tuchong.com/free/search/?term=<term>``,
    extracts the imageIds embedded in the page, then downloads each
    image into the current working directory.  Errors are printed
    rather than raised: per-image failures skip to the next image,
    anything else aborts the run with a message.
    """
    sousuo = input("请输入搜索:")
    # Search URL whose response embeds the imageIds in a script tag.
    base_url = "https://stock.tuchong.com/free/search/?term=" + sousuo
    # Browser-like headers.  NOTE(review): the original also sent a
    # captured session Cookie plus bogus pseudo-headers ("method",
    # "authority": "www.zhihu.com" — a different site entirely); the
    # free search works without them, so they are dropped.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER',
    }
    try:
        response = requests.get(base_url, headers=headers, timeout=30)
        response.raise_for_status()  # fail loudly on non-2xx instead of parsing an error page
        image_ids = _extract_image_ids(response.text)
        # Loop-invariant: hoisted out of the download loop (the original
        # recomputed it per image and bound it to the builtin name `dir`).
        dest_dir = os.getcwd()
        for image_id in image_ids:
            time.sleep(1)  # be polite: throttle to one request per second
            try:
                _download_image(image_id, dest_dir, headers)
            except requests.RequestException as e:
                # Best effort per image: report and continue with the rest.
                print(e)
    except (requests.RequestException, ValueError, KeyError, IndexError) as e:
        # Script-level boundary: print a message instead of a traceback.
        # (Narrowed from the original bare `except Exception`; ValueError
        # covers json.JSONDecodeError, Key/IndexError cover parse slips.)
        print(e)
# Run the scraper only when executed as a script, not when imported
# as a module (the original called it unconditionally at import time).
if __name__ == "__main__":
    get_TCImage()
执行后成功下载