Python · Crawl, Crawl, Crawl (Spiders, Part 1)
Big data, lifecycle stage one: data collection
Classification
General-purpose crawlers
- Baidu, Google, 360 Search
- take every page on the web
Focused crawlers
- take only the specific part of the data you need
My web page
Create a new HTML file in PyCharm:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>斑马的官方网站</title>
    <style>
        /* id selector: matches the one element with id="kangbazi" */
        #kangbazi {
            color: green;
            font-size: 50px;
        }
        /* class selector: matches every element with class="znmd" */
        .znmd {
            color: red;
            font-size: 60px;
        }
    </style>
</head>
<body>
<h1>写字楼里写字间,写字间里程序员;</h1>
<h2>程序人员写程序,又拿程序换酒钱。</h2>
<h3>酒醒只在网上坐,酒醉还来网下眠;</h3>
<h4>酒醉酒醒日复日,网上网下年复年。</h4>
<h5>但愿老死电脑间,不愿鞠躬老板前;</h5>
<h6>奔驰宝马贵者趣,公交自行程序员。</h6>
<!-- HTML headings stop at h6; h7/h8 are not real tags, so plain <p> is used here -->
<p>别人笑我忒疯癫,我笑自己命太贱;</p>
<p>不见满街漂亮妹,哪个归得程序员?</p>
<div id="kangbazi" class="znmd">
    知道我是什么星座吗?射手吗?不是,为你量身定做
</div>
<div class="znmd">
    你上辈子一定是碳酸饮料,要不然一见到你就冒泡泡!
</div>
</body>
</html>
Open it in Chrome:
The data analysis workflow
1. Simulate a browser and send a request to the server
2. Receive the data the server returns
3. Parse the data and extract the parts we need
4. Store the data
5. Clean and filter the data
6. Analyze the data
7. Present the results
(steps 1-4 are sketched in code right after this list)
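A minimal sketch of steps 1-4, assuming the requests library is installed and using example.com as a stand-in URL:

import requests  # pip install requests
import re

# 1. simulate a browser and send the request (User-Agent is a typical desktop value)
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get('https://example.com/', headers=headers)

# 2. take the data the server returned
html = response.text

# 3. parse: a crude regex grab of the <title>, standing in for real parsing
title = re.findall(r'<title>(.*?)</title>', html)

# 4. store the result
with open('result.txt', 'w', encoding='utf-8') as fp:
    fp.write(str(title))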
Open the page, right-click and choose Inspect, go to Network, select All, refresh, then click the first request:
Headers: the request headers
Accept: the content types the client can accept
Cookie:
A cookie is a piece of data stored in a text file on your computer.
When a web server sends a page to a browser, it keeps no record of the user once the connection closes.
Cookies exist to solve exactly that problem, "how to remember who the client is":
when a user visits a page, their name can be recorded in a cookie,
and on the next visit to that page, the record can be read back out of the cookie.
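To see this in code: a requests.Session keeps whatever cookies the server sets and sends them back automatically on later requests. A minimal sketch, using httpbin.org (a public echo service) purely for illustration:

import requests

session = requests.Session()

# the server sets a cookie on the first response ...
session.get('https://httpbin.org/cookies/set/user/zebra')

# ... and the session sends it back by itself on the next request
res = session.get('https://httpbin.org/cookies')
print(res.json())  # {'cookies': {'user': 'zebra'}}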
Anti-scraping and counter-anti-scraping
In the end, the winner is always the crawler.
urllib.request
from urllib import request

# urlretrieve downloads a resource, an image or a whole page, straight into a local file
request.urlretrieve('https://tse3-mm.cn.bing.net/th/id/OIP-C.XEsRHp0jX8o4uRuBRsGWvQHaLF?w=200&h=299&c=7&o=5&pid=1.7', 'meinv.jpg')
request.urlretrieve('https://www.scuec.edu.cn/', 'znmd.html')
urllib.parse
# protocol  domain  query parameters
# https://www.baidu.com/s?ie=UTF-8&wd=%E5%90%B4%E4%BA%A6%E5%87%A1
# a URL percent-encodes Chinese characters into bytes the server can recognize
import urllib.parse

kw = input('Enter a search term: ')
param = {'wd': kw, 'ie': 'utf-8'}
params = urllib.parse.urlencode(param)  # turn the dict into a query string
# url = 'https://www.baidu.com/s?wd=吴亦凡&ie=utf-8'
url = 'https://www.baidu.com/s?' + params
print(url)
# Enter a search term: 吴亦凡
# https://www.baidu.com/s?wd=%E5%90%B4%E4%BA%A6%E5%87%A1&ie=utf-8
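urlencode handles a whole dict at once; for a single value, urllib.parse.quote and unquote do the encoding and decoding directly (unquote reappears in the skin scraper further down):

import urllib.parse

encoded = urllib.parse.quote('吴亦凡')
print(encoded)                        # %E5%90%B4%E4%BA%A6%E5%87%A1
print(urllib.parse.unquote(encoded))  # 吴亦凡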
requests
get() requests: the parameters ride in the URL itself, so they are visible to anyone who sees the URL; the classroom shorthand is "fast but not secure".
post() requests: the parameters travel in the request body and do not appear in the URL; the shorthand is "slower but secure".
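The difference in code, again with httpbin.org as a stand-in server: requests builds the query string for GET and the body for POST. The Baidu example below then uses GET:

import requests

# GET: params become part of the URL -> https://httpbin.org/get?wd=python
r1 = requests.get('https://httpbin.org/get', params={'wd': 'python'})
print(r1.url)

# POST: data goes into the request body and the URL stays clean
r2 = requests.post('https://httpbin.org/post', data={'wd': 'python'})
print(r2.url)             # https://httpbin.org/post
print(r2.json()['form'])  # {'wd': 'python'}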
import requests
import urllib.parse

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
kw = input('Enter a search term: ')
param = {'wd': kw, 'ie': 'utf-8'}
params = urllib.parse.urlencode(param)  # turn the dict into a query string
# url = 'https://www.baidu.com/s?wd=吴亦凡&ie=utf-8'
url = 'https://www.baidu.com/s?' + params
# print(url)
response = requests.get(url, headers=headers)
with open('wuyifan.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)
    fp.flush()
Scraping wzry (Honor of Kings) skins
# eight poster sizes
# use the requests library to get the page source
# extract the image URLs from it
# pull each image down with urlretrieve
# the best way to scrape data: find the one URL (API endpoint) that serves it all
Create a folder named king in the current directory first:
import requests  # pip install requests
import re
import urllib.parse
from urllib import request

print("[2]1024x768\t\t[3]1280x720\t\t[4]1280x1024\t\t[5]1440x900\t\t[6]1920x1080\t\t[7]1920x1200\t\t[8]1920x1440")
size = input('Pick one of the sizes above and enter the number in brackets: ')
for page in range(25):
    url = "https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=4&totalpage=0&page=" + str(page) + "&iOrder=0&iSortNumClose=1&jsoncallback=jQuery1710005403016155852258_1625972438150&iAMSActivityId=51991&_everyRead=true&iTypeId=1&iFlowId=267733&iActId=2735&iModuleId=2735&_=1625972438266"
    # print(url)
    response = requests.get(url)
    # print(response.text)
    # print(response.content.decode('utf-8'))  # raw bytes, decoded by hand
    data = response.content.decode('utf-8')
    imgSize = {"2": "sProdImgNo_2", "3": "sProdImgNo_3", "4": "sProdImgNo_4", "5": "sProdImgNo_5",
               "6": "sProdImgNo_6", "7": "sProdImgNo_7", "8": "sProdImgNo_8"}
    # grab every image URL of the chosen size
    sizeList = re.findall('"' + imgSize[size] + '":"(.*?)"', data)
    # grab every skin name
    nameList = re.findall('"sProdName":"(.*?)"', data)
    # print(sizeList)  # a list
    for i in range(len(sizeList)):
        imgurl = urllib.parse.unquote(sizeList[i]).replace('/200', '/0')  # /0 is the full-size image
        imgName = urllib.parse.unquote(nameList[i])
        print('Downloading: ' + imgName + '.jpg')
        request.urlretrieve(imgurl, 'king/' + imgName + '.jpg')
        request.urlcleanup()  # clear urlretrieve's cache
        print(imgName + '.jpg done')
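The two findall patterns above do all the parsing: "(.*?)" is a non-greedy capture group that grabs just the value between the quotes. The same idea on a made-up JSON snippet, purely for illustration:

import re

sample = '"sProdName":"地狱火","sProdImgNo_6":"http%3A%2F%2Fexample%2Fskin%2F200"'
print(re.findall('"sProdName":"(.*?)"', sample))     # ['地狱火']
print(re.findall('"sProdImgNo_6":"(.*?)"', sample))  # ['http%3A%2F%2Fexample%2Fskin%2F200']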
Scraping wzry (Honor of Kings) heroes
import requests
import json
import os
from urllib import request

# each hero has a different number of skin images
# hero list:  https://pvp.qq.com/web201605/js/herolist.json
# image URL:  https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/<ename>/<ename>-bigskin-<n>.jpg
os.makedirs('heros', exist_ok=True)  # urlretrieve needs the target folder to exist
url = 'https://pvp.qq.com/web201605/js/herolist.json'  # all hero info
res = requests.get(url)  # send the request, get the result
# data = json.loads(res.text)  # json -> dict/list, the manual way
# print(data)
data_dict_list = res.json()  # json -> plain Python types, here a list
# print(data_dict_list)
for data in data_dict_list:
    print('Hero:', data['cname'])
    if 'skin_name' in data:
        skin_name = data['skin_name']
        print('Skins:', skin_name)
        # skins are separated by '|', so count separators + 1
        skin_num = skin_name.count('|') + 1
        print(skin_num)
        # build one image URL per skin
        for x in range(1, skin_num + 1):
            href = f'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{data["ename"]}/{data["ename"]}-bigskin-{x}.jpg'
            print(href)
            request.urlretrieve(href, 'heros/' + data['cname'] + '_' + str(x) + '.jpg')
    else:
        print('incomplete json entry')
Speeding it up with threads
import requests
import json
import os
from urllib import request
from concurrent.futures import ThreadPoolExecutor

def download_image(url, name, idx):
    # idx must be passed in as an argument: reading the loop variable from the
    # outer scope would race against the worker threads
    request.urlretrieve(url, 'heros/' + name + '_' + str(idx) + '.jpg')

os.makedirs('heros', exist_ok=True)
with ThreadPoolExecutor(max_workers=10) as pool:
    # each hero has a different number of skin images
    # hero list:  https://pvp.qq.com/web201605/js/herolist.json
    # image URL:  https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/<ename>/<ename>-bigskin-<n>.jpg
    url = 'https://pvp.qq.com/web201605/js/herolist.json'  # all hero info
    res = requests.get(url)  # send the request, get the result
    # data = json.loads(res.text)  # json -> dict/list, the manual way
    # print(data)
    data_dict_list = res.json()  # json -> plain Python types, here a list
    # print(data_dict_list)
    for data in data_dict_list:
        print('Hero:', data['cname'])
        if 'skin_name' in data:
            skin_name = data['skin_name']
            print('Skins:', skin_name)
            # skins are separated by '|', so count separators + 1
            skin_num = skin_name.count('|') + 1
            print(skin_num)
            # build one image URL per skin
            for x in range(1, skin_num + 1):
                href = f'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{data["ename"]}/{data["ename"]}-bigskin-{x}.jpg'
                # print(href)
                # request.urlretrieve(href, 'heros/'+data['cname']+'_'+str(x)+'.jpg')  # the serial version
                pool.submit(download_image, href, data['cname'], x)
        else:
            print('incomplete json entry')
Multithreaded scraping of dy (Douyu)
import os
import requests
import threading

def get_douyu(url):
    '''
    Fetch one page of streamer data and save the cover images.
    :param url: the page URL to request
    :return:
    '''
    res = requests.get(url)  # send the request, get the page source
    json_data = res.json()   # json-typed data
    rl_list = json_data['data']['rl']  # json -> plain Python list
    # print(rl_list)
    for zb in rl_list:
        name = zb.get('nn')        # streamer name
        imageUrl = zb.get('rs16')  # cover image URL
        imagePath = 'douyu/%s.jpg' % name  # where to save the image
        res = requests.get(imageUrl)  # request the image itself
        data = res.content            # binary data
        with open(imagePath, 'wb') as fp:
            fp.write(data)
            fp.flush()

os.makedirs('douyu', exist_ok=True)
t_list = []
for x in range(0, 6):
    url = 'https://www.douyu.com/gapi/rkc/directory/2_201/%d' % x
    # create one thread per page
    t1 = threading.Thread(target=get_douyu, args=(url,))  # assign the task
    t1.start()         # start working
    t_list.append(t1)  # collect every thread in a list
for t in t_list:
    t.join()  # wait here until every thread has finished
Tieba (Baidu forums)
import time
import requests
head = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
'Referer': 'https://tieba.baidu.com/p/1457326101',  # the Referer check is mandatory here
'Cookie': 'bdshare_firstime=1586610526068; BIDUPSID=AC5EA76A9B7F801E663B22CE440C3596; PSTM=1587216498; __yjs_duid=1_a9afdb92ecd78367e6655cb88ddc74f91618279756617; BAIDUID=6969587DE6B159CCAEAABB2C292F16CF:FG=1; BDUSS_BFESS=3RGLWJJOWdKTGRkSXhLVlJPeFpRdVI0RGw3YWVaY3h6NGJUTEpXdDE5OElpfk5nRUFBQUFBJCQAAAAAAAAAAAEAAABKkmgiZ2FvaGo1AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAj-y2AI~stgU; BDUSS=3RGLWJJOWdKTGRkSXhLVlJPeFpRdVI0RGw3YWVaY3h6NGJUTEpXdDE5OElpfk5nRUFBQUFBJCQAAAAAAAAAAAEAAABKkmgiZ2FvaGo1AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAj-y2AI~stgU; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; STOKEN=61cd44dfb4cfb1cf2e5f393188bad74894b4a73bd00dacd2a8ef784b07b4dfc0; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1625989555; st_key_id=17; USER_JUMP=-1; BAIDU_WISE_UID=wapp_1625989668498_113; 577278538_FRSVideoUploadTip=1; video_bubble577278538=1; wise_device=0; tb_as_data=f3a88ff6f315716042874bf86697b220724dd6b356e7ad9a6923fb883b489819dacfeef9380bf64f4a5d5fe97f9bbedaf60e9d0d402f0f3ac206e23642c3dc7bbd7c48f3e355f0b51c9df68cbd923d3fc299b7387817bfdfdf306afde7f21cf8650a0e5c0178bba7dd7a43938d697290; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1625991408; st_data=0000ce93dba7d046132a9fcf1b148856c22ae898efbaec09ad3945ac740abbecaf117243e0cca6effc9725fc7bf3cdfb9809c5344daf2b8e0647a124cb3ab1ad74cf941149c5fec53b01882c85ff2e817881fc13e8dea31df0b87c35da1cc944e68ee9529040e62f9e302d17d37eb746b2bed6ca0d85efe763f2844b65fd57c5; st_sign=852d752e; ab_sr=1.0.1_ZjA4YThmYTVlOTk4M2M5OGE4ZjBlYjgyNjMyOTg0MTEyNzgwYmE1OWRlZmFjYWM3NjFhOTA1YTNjNzc1NDUzMjE0Yzc0Y2RlOTA2MzZiNjc3ZDQ2MDRmYmZhN2RlNTU3YWQ0ZDc1MzE3ODZlZGExODQyNDc0MjVlZTczZmFjNDQ2YzMyYjAyZTUyZWI5OTRjZDA0YzM4OTBhM2VmOWQ3YQ==',
}
head1 = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
'Host': 'imgsrc.baidu.com',
'Cookie': 'bdshare_firstime=1586610526068; BIDUPSID=AC5EA76A9B7F801E663B22CE440C3596; PSTM=1587216498; __yjs_duid=1_a9afdb92ecd78367e6655cb88ddc74f91618279756617; BAIDUID=6969587DE6B159CCAEAABB2C292F16CF:FG=1; BDUSS_BFESS=3RGLWJJOWdKTGRkSXhLVlJPeFpRdVI0RGw3YWVaY3h6NGJUTEpXdDE5OElpfk5nRUFBQUFBJCQAAAAAAAAAAAEAAABKkmgiZ2FvaGo1AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAj-y2AI~stgU; BDUSS=3RGLWJJOWdKTGRkSXhLVlJPeFpRdVI0RGw3YWVaY3h6NGJUTEpXdDE5OElpfk5nRUFBQUFBJCQAAAAAAAAAAAEAAABKkmgiZ2FvaGo1AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAj-y2AI~stgU; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; STOKEN=61cd44dfb4cfb1cf2e5f393188bad74894b4a73bd00dacd2a8ef784b07b4dfc0; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1625989555; st_key_id=17; USER_JUMP=-1; BAIDU_WISE_UID=wapp_1625989668498_113; 577278538_FRSVideoUploadTip=1; video_bubble577278538=1; wise_device=0; tb_as_data=f3a88ff6f315716042874bf86697b220724dd6b356e7ad9a6923fb883b489819dacfeef9380bf64f4a5d5fe97f9bbedaf60e9d0d402f0f3ac206e23642c3dc7bbd7c48f3e355f0b51c9df68cbd923d3fc299b7387817bfdfdf306afde7f21cf8650a0e5c0178bba7dd7a43938d697290; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1625991408; st_data=0000ce93dba7d046132a9fcf1b148856c22ae898efbaec09ad3945ac740abbecaf117243e0cca6effc9725fc7bf3cdfb9809c5344daf2b8e0647a124cb3ab1ad74cf941149c5fec53b01882c85ff2e817881fc13e8dea31df0b87c35da1cc944e68ee9529040e62f9e302d17d37eb746b2bed6ca0d85efe763f2844b65fd57c5; st_sign=852d752e; ab_sr=1.0.1_ZjA4YThmYTVlOTk4M2M5OGE4ZjBlYjgyNjMyOTg0MTEyNzgwYmE1OWRlZmFjYWM3NjFhOTA1YTNjNzc1NDUzMjE0Yzc0Y2RlOTA2MzZiNjc3ZDQ2MDRmYmZhN2RlNTU3YWQ0ZDc1MzE3ODZlZGExODQyNDc0MjVlZTczZmFjNDQ2YzMyYjAyZTUyZWI5OTRjZDA0YzM4OTBhM2VmOWQ3YQ=='
}
# fetch the target page as JSON via requests
def get_json(url):
    r = requests.get(url, headers=head)
    if r.status_code != 200:  # anything but 200 means the page was not fetched normally
        # 200 means the request succeeded
        # 4xx codes mean the client side is at fault: wrong method, wrong URL, ...
        # 5xx codes mean the server side is at fault: a bug, the service is down, ...
        raise Exception()  # raise an exception
    return r.json()  # convert the JSON into plain Python types
# fetch one image
def get_pic(pic_url, name):
    r = requests.get(pic_url, headers=head1)
    print(r)
    if r.status_code != 200:  # anything but 200 means the image was not fetched
        raise Exception()
    filename = pic_url.rsplit('/')[-1]  # the image's file name
    print(filename)
    with open('girl1/' + name + filename, mode='wb') as sw:
        sw.write(r.content)  # save the image to disk
    print('Downloaded ' + filename + ' successfully!')
# parse the JSON listing and download each picture
def parse_json(json, name):
    pic_list = json.get('data').get('pic_list')
    for pic in pic_list:
        purl = pic.get('purl')  # pull out the image URL
        # print(purl)
        get_pic(purl, name)  # hand it to get_pic above to download
        time.sleep(5)
# generate a 13-digit (millisecond) timestamp
def get_timestamp():
    return str(int(time.time() * 1000))  # time.time() is in seconds; *1000 gives 13 digits
# main download routine
def maindown(url, name):
    # fetch the JSON listing
    json = get_json(url)
    # print(json)
    # extract the links from it and download them
    parse_json(json, name)
if __name__ == '__main__':
    for page in range(1, 5):
        for i in range(page, page + 5):
            start = (i - page) * 40 + 1 + 200 * (page - 1)  # 201, 241, 281, 321, 361, ...
            end = 200 * (page - 1) + (i - page + 1) * 40    # 240, 280, 320, 360, 400, 440, ... (blocks of 40)
            ts = get_timestamp()
            url = f'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E5%88%98%E8%AF%97%E8%AF%97&alt=jview&rn=200&tid=3263751314&pn={page}&ps={start}&pe={end}&info=1&_={ts}'
            maindown(url, '刘诗诗')
            time.sleep(5)
(A screenshot of the result should go here, but it could not be posted.)
To scrape Zhao Liying's photo set instead, point the URL at her thread:
url = f'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E8%B5%B5%E4%B8%BD%E9%A2%96&alt=jview&rn=200&tid=4174050087&pn={page}&ps={start}&pe={end}&info=1&_={ts}'
db (Douban) movies
import requests
import csv

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
# after parsing, write the rows into a csv file
file = open('豆瓣电影.csv', 'w', encoding='utf-8-sig', newline='')
writer = csv.writer(file)
writer.writerow(('Title', 'Rating', 'Poster'))
for x in range(10):
    url = f'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start={x*20}'
    # print(url)
    res = requests.get(url, headers=headers)  # send the request, get the page source
    print(res)
    data = res.json()  # json-typed data
    data_list = data.get('subjects')  # json -> plain Python list
    # print(data_list)
    for data in data_list:
        title = data['title']  # movie title
        rate = data['rate']    # rating
        cover = data['cover']  # poster URL
        movielist = [title, rate, cover]
        print(movielist)
        writer.writerow(movielist)
file.close()  # the file was opened by hand, so close it by hand
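A small design note: because the csv file above is opened manually, it must also be closed manually (the file.close() at the end). Wrapping the loop in a with block does the same job and closes the file even if an exception interrupts the loop. The same skeleton, sketched:

import csv

with open('豆瓣电影.csv', 'w', encoding='utf-8-sig', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(('Title', 'Rating', 'Poster'))
    # ... same request / parse / writerow loop as above ...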