使用爬虫模拟浏览器访问并爬取网页
使用request访问网站
import requests
# url = 'https://www.baidu.com'
# 发起请求
# response = requests.get(url)
# 返回源代码 可能会乱码 需要处理
# print(response.text)
# content 二进制字节码 使用utf-8解析二进制字节码
# print(response.content.decode('UTF-8'))
# 查看网站编码
# print(response.encoding)
配置headers来爬取反爬虫网站
# Give the request a headers dict that identifies us as a Chrome browser, so Zhihu accepts it
url = 'https://www.zhihu.com'
# User-Agent identifies the client browser. If requests sends its default
# ("python-requests/..."), most anti-scraping sites refuse to serve the page;
# with a browser-like User-Agent the site treats us as a normal browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}
response = requests.get(url, headers=headers)
print(response)
get请求的两种拼接参数的方法
# Two ways to attach query parameters to a GET request:
# way 1: append them to the url by hand with ?key=value
url1 = 'https://www.antvv.com/?id=14'
url2 = 'https://www.antvv.com/'
# way 2: pass a params dict; requests url-encodes and appends it for us
params = {
    "id": 14
}
response = requests.get(url2, params=params)
print(response.text)
爬取视频和图片
# Requesting resource files (images, video, audio)
# Video download: works only when the url points directly at the media resource
# url = "https://saas.jialingmm.net/code.php?type=sina&vid=79087457&userlink=http%3A%2F%2Fwww.imomoe.in%2Fplayer%2F5040-1-0.html&adress=HeNan"
# # resp = requests.get(url, stream=True)
# # with open('./data/xxx.mp4', 'wb') as file:
# #     for j in resp.iter_content(1024*10):
# #         file.write(j)
# Download an image
url = "https://i0.hdslb.com/bfs/sycp/creative_img/202007/b9401d1fecb4dcc24d8154271d9c1f17.jpg"
resp = requests.get(url)
# resp.content is the raw response bytes; write them out in binary mode
with open('./data/wl.png', 'wb') as file:
    file.write(resp.content)
作业
爬取一个页面 保存到本地的index.html里
# Scrape one page and save it locally as index.html
import requests
url = 'https://www.bilibili.com/'
resp = requests.get(url)
html = resp.content.decode('utf-8')
# Error seen: UnicodeEncodeError: 'gbk' codec can't encode character '\uff62' in position 4865: illegal multibyte sequence
# Cause: on Windows a newly created file defaults to the gbk encoding, so Python tries to
# encode our already-decoded unicode text as gbk and fails on characters gbk cannot represent.
# Fix: open the target file with an explicit utf-8 encoding.
with open('./data/index.html', 'w', encoding='utf-8') as file:
    file.write(html)
爬取两个图片并保存在本地
import requests
# Download two images and save them locally, one file per url.
url1 = 'http://i0.hdslb.com/bfs/bangumi/image/08e401fa1e02cfb1be175a1bc3ac819b2e1e3a0b.jpg@87w_88h_1c_100q.webp'
url2 = 'http://i0.hdslb.com/bfs/bangumi/image/b3211f93d0d72cc720a6ae88d99e43a7b4f8a139.png@87w_88h_1c_100q.webp'
urls = [url1, url2]
# enumerate supplies the index for the filename without range(len(...)) indexing
for i, url in enumerate(urls):
    resp = requests.get(url)
    # resp.content is the raw image bytes; write in binary mode
    with open('./data/' + str(i) + '.jpg', 'wb') as file:
        file.write(resp.content)
爬取网站中所有的图片路径并下载
import re
import requests
# Scrape every image url on the page with a regex and download each one.
url = 'http://www.gaokao.com/gkpic/'
resp = requests.get(url)
# The site is gbk-encoded
html = resp.content.decode('gbk')
# Non-greedy (.*?) so each capture stops at the first closing quote; the greedy
# (.*) the original used swallows everything up to the LAST `" alt=` on a line
# when several <img> tags share one line.
urls = re.findall('<img src="(.*?)" alt=".*?">', html)
# The matching caption for each image
titles = re.findall('<p class="p1">(.*?)</p>', html)
# zip pairs each url with its title and stops at the shorter list, avoiding the
# IndexError the parallel-index loop risked; counting from 1 fixes the
# off-by-one progress message (the original printed "第0张" for the first image)
for i, (img_url, title) in enumerate(zip(urls, titles), start=1):
    # Open the target file in binary mode
    with open('./data/img/'+title+'.jpg', 'wb') as file:
        # Request the image itself
        resp = requests.get(img_url)
        # Save the image bytes
        file.write(resp.content)
    print('第{}张图片已经下载完成'.format(i))
对request和response模块总结
request
import requests
# response = requests.get(url,headers=,params=,stream=Bool)
# url:网址,协议://网址
# # headers 请求的时候附加的消息头,User-Agent(用户标识),Referer(来源页)
# # params 请求附带的参数,也可以通过?号的方式,直接拼接在url后面
# stream 是否使用流进行传输,如果开启的话,在请求页面的时候,只得到返回的消息头,内容部分,一部分一部分地下载
# proxies 设置代理
# timeout 设置超时的时间
# verify 是否强制验证证书
response
# 返回值 response
# status_code 返回的状态码
# encoding 网页的编码
# text 返回网页的源代码,可以自动的去解析中文
# content 返回内容的字节码,可以使用decode()解析中文
# json() 把json字符串编程python的数据类型
json与python的转换
import json
# json->python json.loads(Json) 把Json字符串转化为Python数据类型
# python->json json.dumps(Python) 把Python数据类型转换为Json字符串
# json_str = '''{"name":"小明","age":19}'''
# json_python = json.loads(json_str)
# print(type(json_python))
# print(type(json_str))
#
# infos = {"chinese":80,'math':90}
# str1 = json.dumps(infos)
# print(type(str1))
# print(type(infos))
设置代理爬虫
import requests
# httpbin echoes the request details back as JSON, so we can see which IP reached it
url = "http://httpbin.org/get"
proxies = {
    'http': '128.199.81.100:44321'
}
# proxies routes the request through the given proxy; timeout (seconds) aborts a slow request
r = requests.get(url, proxies=proxies, timeout=5)
print(r.text)
# json() parses the JSON response body into Python data structures
result = r.json()
print(result)
print(type(result))  # <class 'dict'>
print(result['origin'])
post请求
post请求模拟登录
# Simulate a login with a POST request
# NOTE(review): credentials are hard-coded here — keep real secrets out of source control
data = {
    "username": "13477296494",
    "password": "tt9706282"
}
url = "http://www.gearmk.cn/loginPage"
# data= sends the fields as a form-encoded POST body
resp = requests.post(url,data = data)
print(resp.text)
用post请求发送文件
# Send a file with a POST request (multipart upload).
url = 'http://httpbin.org/post'
data = {
    'username': '123',
    'Lain': '555'
}
# Open the file in a with-block so the handle is closed even if the request
# fails; the original opened it inside the dict and never closed it.
with open('./data/wl.png', "rb") as img_file:
    files = {
        'img': img_file
    }
    # data= carries the form fields, files= carries the multipart file part
    resp = requests.post(url, data=data, files=files)
print(resp.text)
Cookies
import requests
# Simulate a login with requests, then reuse the returned cookies for a member-only page
url = 'https://www.woyaogexing.com/e/member/ddoaction.php'
# NOTE(review): hard-coded credentials — keep real secrets out of source control
data = {
    "ecmsfrom": "",
    "enews": "login",
    "useraccount": "13477296494",
    "password": "123456",
    "lifetime": "2592000",
    "Submit": "登录"
}
resp = requests.post(url, data=data)
# Cookies set by the login response; they carry the session identity
coo = resp.cookies
# print(resp.text)
url2 = 'https://www.woyaogexing.com/e/member/msg/AddMsg/?username=wygx_96494'
# Without cookies the member page does not recognize the session:
# response = requests.get(url2)
# # print(response.text)
# Attach the login cookies so the site treats this request as the logged-in user
response = requests.get(url2, cookies=coo)
print(response.text)
模拟登录
模拟百度登陆
import requests
# Replay cookies captured from a logged-in browser session so Baidu serves the
# personalized (logged-in) page.
# NOTE(review): these cookie values are session secrets and expire; replace with fresh ones
cookies = {
    "BAIDUI": "C628F6100D67DAAA38EE799AC7D55622",
    "F": "1",
    "BIDUPSI": "C628F6100D67DAAA38EE799AC7D55622",
    "PST": "1592218093",
    "BDUS": "FLLUszWWZXfkxidUhufjd1V2UzYTV6Sk9hdC1yYlpmcGtJenJpRE5hUk5DUlZmRVFBQUFBJCQAAAAAAAAAAAEAAAB5xMSA1r-wrszsvfK35wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE187V5NfO1eN0",
    "cfla": "13%3A3",
    "BD_UP": "1a314753",
    "BDRCVFR[u0HWrfoqje0": "mk3SLVN4HKm",
    "BD_HOM": "",
    "H_PS_PSSI": "31905_1441_31325_32139_32231_32257"
}
url = 'https://www.baidu.com/?tn=78040160_14_pg&ch=16'
# cookies= attaches the saved cookies to the request
resp = requests.get(url, cookies=cookies)
print(resp.text)
模拟哔哩哔哩登陆
import requests
# Replay cookies captured from a logged-in browser session so Bilibili serves the
# logged-in homepage.
# NOTE(review): these cookie values are session secrets and expire; replace with fresh ones
cookies = {
    "_uui": "823C561A-B49B-1EE2-74BC-DFB23CFF37A509660infoc",
    "buvid": "76823765-83CD-4D05-B26D-B80C0FF644E7155842infoc",
    "si": "i78mph55",
    "DedeUserI": "18686436",
    "DedeUserID__ckMd": "d383d19f52e573e1",
    "SESSDAT": "56eddd60%2C1607733022%2C5ffcf*61",
    "bili_jc": "753b4ec5dd21dc83d4e11e917d93184b",
    "CURRENT_FNVA": "16",
    "rpdi": "|(J~R~J|lRul0J'ulmkumY~k|",
    "LIVE_BUVI": "AUTO9815921979956077",
    "CURRENT_QUALIT": "80",
    "PVI": "1",
    "bp_video_offset_1868643": "410528496169982512"
}
url = 'https://www.bilibili.com/'
response = requests.get(url, cookies=cookies)
print(response.text)
BS4
BS4重要的两个API——find_all()&attrs[]
find_all(tagName, attrs={}, text=) 返回的是数组
attrs[‘属性名’]
from bs4 import BeautifulSoup
# find_all() demo
import requests
# find_all(tagName, attrs={}, text=) demo: download every image linked on the page.
url = 'http://www.gaokao.com/gkpic/'
resp = requests.get(url)
# Page source; the site is gbk-encoded
txt = resp.content.decode('gbk')
# Parse with BS4
soup = BeautifulSoup(txt, 'html.parser')
# find_all returns a list; search by tag name, attrs dict, or text.
# Here: all <a> tags inside the div whose id is "imgall".
anchors = soup.find_all('div', attrs={"id": "imgall"})[0].find_all('a')
print(len(anchors))
for item in anchors:
    # Hoist the repeated lookup — the original called item.find_all('img')[0]
    # once for the src and again for the alt.
    img = item.find_all('img')[0]
    # attrs['name'] reads an attribute off the tag
    img_src = img.attrs['src']
    # The alt text doubles as the file name
    name = img.attrs['alt']
    # Request the image url and save the bytes
    resp = requests.get(img_src)
    with open('./data/img/'+name+'.jpg', 'wb') as file:
        file.write(resp.content)
    print(img_src)
BS4中其他API
from bs4 import BeautifulSoup
import requests
import re
# Tour of the other BS4 APIs: dot navigation, find_all filters, limit, find,
# contents / parent / descendants.
url = 'http://www.gaokao.com/gkpic/'
# Fetch the page
resp = requests.get(url)
# Page source (the site is gbk-encoded)
html = resp.content.decode('gbk')
# Parse into a BeautifulSoup document. The original also bound the same object
# to a redundant `soup` alias that was never used; it is dropped here.
html = BeautifulSoup(html, 'html.parser')
# 1. Dot navigation walks the tree, but only reaches the FIRST tag of each name
# print(html.body.link)
# 2. Search with find_all(tagName, attrs, text)
# Divs whose class is "wrapper"
# print(len(html.find_all('div', attrs={'class': 'wrapper'})))
# Every div
# print(html.find_all('div'))
# text= searches by text content; returns a list of the matching text objects
# print(html.find_all(text=re.compile('上海')))
# Chained search: run find_all again on an earlier result
# print(len(html.find_all('div',attrs={'id':'imgall','class':'tushuogaokao clearfix'})[0].find_all('a')))
# limit= keeps only the first n results; a slice is more flexible (items n..m)
# print(html.find_all(text=re.compile('上海'),limit=3))
# print(html.find_all(text=re.compile('上海'))[1:3])
# find() returns the first match in the downloaded source (F12 may show extra
# nodes that ajax injects later; those are absent here)
# print(html.find('div'))
# contents: the direct children of a node
# print(html.head.contents)
# parent: the parent node
# print(html.body.div.parent.parent)  returns body's parent: html
# descendants: every descendant of a node
# print(list(html.body.div.descendants))
# print(html.body.div.contents)
保存每个人的系列图片
import requests
from bs4 import BeautifulSoup
import os
# Download every image series: for each gallery link on the index page, walk its
# detail pages (up to 10) and save each page's image under ./data/detail/<title>/.
url = 'http://www.gaokao.com/gkpic/'
resp = requests.get(url)
# Site is gbk-encoded
html = resp.content.decode('gbk')
# Parse with BS4
soup = BeautifulSoup(html, 'html.parser')
# All <a> gallery links
a_all = soup.find_all('div', attrs={'id': 'imgall'})[0].find_all('a')
# print(a_all[0])
for item in a_all:
    # Series title
    title = item.find_all('div')[0].find_all('p', attrs={'class': 'p1'})[0].text
    # Detail-page url pattern:
    # http://www.gaokao.com/e/20160201/56aed7e340aa9.shtml      (page 1)
    # http://www.gaokao.com/e/20160201/56aed7e340aa9_2.shtml    (page 2)
    # http://www.gaokao.com/e/20160201/56aed7e340aa9_3.shtml    (page 3)
    step = 1
    while True:
        # href of the <a> tag (the page-1 url)
        url_detail = item.attrs['href']
        if step != 1:
            # Pages 2+ insert "_<step>" before ".shtml"; the [:-6] slice assumes
            # the href always ends in ".shtml" — TODO confirm
            url_detail = url_detail[:-6]+'_'+str(step)+'.shtml'
        if step == 11:
            # Hard cap: at most 10 pages per series
            break
        # Fetch the detail page
        response = requests.get(url_detail)
        # Page source
        html_detail = response.content.decode('gbk')
        # Parse with BS4
        html_detail = BeautifulSoup(html_detail, 'html.parser')
        try:
            # Image url on this page of the series
            # url_img = html_detail.find_all('div', attrs={'class': 'main'})[0].find_all('p')[1].find_all('img')[0].attrs['src']
            url_img = html_detail.find_all('div', attrs={'class': 'main'})[0].find_all('p', attrs={'style': 'text-align: center;'})[0].find_all('img')[0].attrs['src']
            # Image name
            img_title = html_detail.find_all('h1', attrs={'class': 'bm10'})[0].find_all('a')[0].text
        except BaseException as e:
            # Parsing failed (page missing / past the last page) — stop this series
            break
        # NOTE(review): open() fails unless ./data/detail/<title>/ already exists —
        # the directory-creation code below is commented out and must run once first
        with open('./data/detail/'+title+'/'+img_title+'.png', 'wb') as f:
            resp = requests.get(url_img)
            f.write(resp.content)
        print("第"+title+"系列的第{}张图片已经下载完成".format(step))
        step += 1
        pass
    # # Create the series folder — run this once before downloading
    # try:
    #     os.mkdir("./data/detail/" + title)
    # except Exception as e:
    #     continue
    print("第"+title+"系列下载完成")
爬取前程无忧的信息并存入mysql数据库中或者存入csv文件
import requests
from bs4 import BeautifulSoup
import pymysql
# 分页
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,3.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,4.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# 不同职位
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,人工智能,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# 不同地区
# 武汉
# https://search.51job.com/list/180200,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# https://search.51job.com/list/010000,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# https://search.51job.com/list/040000,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# 重庆的
# https://search.51job.com/list/060000,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# Save the job records into the MySQL database.
def save_to_dataSource(job_list):
    """Insert every job dict in job_list into the `job` table.

    Each dict must provide the keys: job_title, com_title, place, money,
    create_time.
    """
    db = pymysql.connect('localhost', 'login', '123', 'python')
    try:
        # Cursor object runs the SQL statements against the connection
        cursor = db.cursor()
        # Parameterized query: letting the driver escape the values prevents
        # SQL injection and broken statements when a field contains a quote
        # (the original built the statement with str.format).
        sql = "insert into job(job_title,com_title,place,money,create_time) values(%s,%s,%s,%s,%s)"
        for item in job_list:
            cursor.execute(sql, (item['job_title'], item['com_title'], item['place'], item['money'], item['create_time']))
        # Commit the transaction once after all rows are queued
        db.commit()
    finally:
        # The original leaked the connection; always close it
        db.close()
# Save the job records to a csv file.
def save_to_file(job_list):
    """Append one comma-separated line per job dict to the csv file."""
    field_names = ('job_title', 'com_title', 'place', 'money', 'create_time')
    # Append mode so repeated runs accumulate rows; utf-8 for the Chinese fields
    with open('./data/15/前程无忧信息.csv', 'a', encoding='utf-8') as file:
        for record in job_list:
            line = ','.join(str(record[key]) for key in field_names)
            file.write(line + '\n')
# NOTE(review): this call appears BEFORE get_many_page_info is defined below — executed
# top-to-bottom it raises NameError; move it after the function definitions.
get_many_page_info(10)
# Fetch every job on one result page.
def get_one_page_info(url):
    """Download one 51job result page and return a list of job dicts.

    Each dict has the keys job_title, com_title, place, money, create_time;
    commas inside text fields are replaced with '-' so csv columns stay aligned.
    """
    response = requests.get(url)
    # The site is gbk-encoded; decode, then parse with BS4
    page = BeautifulSoup(response.content.decode('gbk'), 'html.parser')
    rows = page.find_all('div', attrs={'id': 'resultList'})[0].find_all('div', attrs={'class': 'el'})
    # The first row is the column-header row, not a job posting
    rows = rows[1:]
    job_list = []
    for row in rows:
        # Job title
        job_title = row.find_all('p', attrs={'class': 't1'})[0].attrs['title'].replace(",", '-')
        # Company name
        com_title = row.find_all('span', attrs={'class': 't2'})[0].find_all('a')[0].attrs['title'].replace(",", '-')
        # Work location
        place = row.find_all('span', attrs={'class': 't3'})[0].text.replace(",", '-')
        # Salary; an empty cell means "negotiable"
        money = row.find_all('span', attrs={'class': 't4'})[0].text
        if money == '':
            money = '工资面议'
        # Posting date
        create_time = row.find_all('span', attrs={'class': 't5'})[0].text
        job_list.append({
            "job_title": job_title,
            'com_title': com_title,
            'place': place,
            'money': money,
            'create_time': create_time,
        })
    return job_list
# Fetch several result pages in a row.
def get_many_page_info(page_num):
    """Scrape result pages 1..page_num and persist each page to the database."""
    # The page number sits inside the url path segment, between these two parts
    base_prefix = "https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,"
    base_suffix = ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    for page in range(1, page_num + 1):
        # Build this page's url, scrape it, and store the rows
        url = base_prefix + str(page) + base_suffix
        job_list = get_one_page_info(url)
        save_to_dataSource(job_list)
        print("第{}页已经爬取完毕".format(page))