使用爬虫模拟浏览器访问并爬取网页
使用request访问网站
import requests
# url = 'https://www.baidu.com'
# 发起请求
# response = requests.get(url)
# 返回源代码 可能会乱码 需要处理
# print(response.text)
# content 二进制字节码 使用utf-8解析二进制字节码
# print(response.content.decode('UTF-8'))
# 查看网站编码
# print(response.encoding)
配置headers来爬取反爬虫网站
# Give the request a headers dict that identifies us as a Chrome browser, so Zhihu accepts it
url = 'https://www.zhihu.com'
# User-Agent identifies the client browser. If requests sends its default
# ("python-requests/..."), most anti-scraping sites refuse to serve the page;
# with a browser-like User-Agent the site treats us as a normal browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}
response = requests.get(url, headers=headers)
print(response)
get请求的两种拼接参数的方法
# Two ways to attach query parameters to a GET request:
# way 1: append them to the url by hand with ?key=value
url1 = 'https://www.antvv.com/?id=14'
url2 = 'https://www.antvv.com/'
# way 2: pass a params dict; requests url-encodes and appends it for us
params = {
    "id": 14
}
response = requests.get(url2, params=params)
print(response.text)
爬取视频和图片
# Requesting resource files (images, video, audio)
# Video download: works only when the url points directly at the media resource
# url = "https://saas.jialingmm.net/code.php?type=sina&vid=79087457&userlink=http%3A%2F%2Fwww.imomoe.in%2Fplayer%2F5040-1-0.html&adress=HeNan"
# # resp = requests.get(url, stream=True)
# # with open('./data/xxx.mp4', 'wb') as file:
# #     for j in resp.iter_content(1024*10):
# #         file.write(j)
# Download an image
url = "https://i0.hdslb.com/bfs/sycp/creative_img/202007/b9401d1fecb4dcc24d8154271d9c1f17.jpg"
resp = requests.get(url)
# resp.content is the raw response bytes; write them out in binary mode
with open('./data/wl.png', 'wb') as file:
    file.write(resp.content)
作业
爬取一个页面 保存到本地的index.html里
# Scrape one page and save it locally as index.html
import requests
url = 'https://www.bilibili.com/'
resp = requests.get(url)
html = resp.content.decode('utf-8')
# Error seen: UnicodeEncodeError: 'gbk' codec can't encode character '\uff62' in position 4865: illegal multibyte sequence
# Cause: on Windows a newly created file defaults to the gbk encoding, so Python tries to
# encode our already-decoded unicode text as gbk and fails on characters gbk cannot represent.
# Fix: open the target file with an explicit utf-8 encoding.
with open('./data/index.html', 'w', encoding='utf-8') as file:
    file.write(html)
爬取两个图片并保存在本地
import requests
# Download two images and save them locally, one file per url.
url1 = 'http://i0.hdslb.com/bfs/bangumi/image/08e401fa1e02cfb1be175a1bc3ac819b2e1e3a0b.jpg@87w_88h_1c_100q.webp'
url2 = 'http://i0.hdslb.com/bfs/bangumi/image/b3211f93d0d72cc720a6ae88d99e43a7b4f8a139.png@87w_88h_1c_100q.webp'
urls = [url1, url2]
# enumerate supplies the index for the filename without range(len(...)) indexing
for i, url in enumerate(urls):
    resp = requests.get(url)
    # resp.content is the raw image bytes; write in binary mode
    with open('./data/' + str(i) + '.jpg', 'wb') as file:
        file.write(resp.content)
爬取网站中所有的图片路径并下载
import re
import requests
# Scrape every image url on the page with a regex and download each one.
url = 'http://www.gaokao.com/gkpic/'
resp = requests.get(url)
# The site is gbk-encoded
html = resp.content.decode('gbk')
# Non-greedy (.*?) so each capture stops at the first closing quote; the greedy
# (.*) the original used swallows everything up to the LAST `" alt=` on a line
# when several <img> tags share one line.
urls = re.findall('<img src="(.*?)" alt=".*?">', html)
# The matching caption for each image
titles = re.findall('<p class="p1">(.*?)</p>', html)
# zip pairs each url with its title and stops at the shorter list, avoiding the
# IndexError the parallel-index loop risked; counting from 1 fixes the
# off-by-one progress message (the original printed "第0张" for the first image)
for i, (img_url, title) in enumerate(zip(urls, titles), start=1):
    # Open the target file in binary mode
    with open('./data/img/'+title+'.jpg', 'wb') as file:
        # Request the image itself
        resp = requests.get(img_url)
        # Save the image bytes
        file.write(resp.content)
    print('第{}张图片已经下载完成'.format(i))
对request和response模块总结
request
import requests
# response = requests.get(url,headers=,params=,stream=Bool)
# url:网址,协议://网址
# # headers 请求的时候附加的消息头,User-Agent(用户标识),Referer(来源页)
# # params 请求附带的参数,也可以通过?号的方式,直接拼接在url后面
# stream 是否使用流进行传输,如果开启的话,在请求页面的时候,只得到返回的消息头,内容部分,一部分一部分地下载
# proxies 设置代理
# timeout 设置超时的时间
# verify 是否强制验证证书
response
# 返回值 response
# status_code 返回的状态码
# encoding 网页的编码
# text 返回网页的源代码,可以自动的去解析中文
# content 返回内容的字节码,可以使用decode()解析中文
# json() 把json字符串编程python的数据类型
json与python的转换
import json
# json->python json.loads(Json) 把Json字符串转化为Python数据类型
# python->json json.dumps(Python) 把Python数据类型转换为Json字符串
# json_str = '''{"name":"小明","age":19}'''
# json_python = json.loads(json_str)
# print(type(json_python))
# print(type(json_str))
#
# infos = {"chinese":80,'math':90}
# str1 = json.dumps(infos)
# print(type(str1))
# print(type(infos))
设置代理爬虫
import requests
# httpbin echoes the request details back as JSON, so we can see which IP reached it
url = "http://httpbin.org/get"
proxies = {
    'http': '128.199.81.100:44321'
}
# proxies routes the request through the given proxy; timeout (seconds) aborts a slow request
r = requests.get(url, proxies=proxies, timeout=5)
print(r.text)
# json() parses the JSON response body into Python data structures
result = r.json()
print(result)
print(type(result))  # <class 'dict'>
print(result['origin'])
post请求
post请求模拟登录
# Simulate a login with a POST request
# NOTE(review): credentials are hard-coded here — keep real secrets out of source control
data = {
    "username": "13477296494",
    "password": "tt9706282"
}
url = "http://www.gearmk.cn/loginPage"
# data= sends the fields as a form-encoded POST body
resp = requests.post(url,data = data)
print(resp.text)
用post请求发送文件
# Send a file with a POST request (multipart upload).
url = 'http://httpbin.org/post'
data = {
    'username': '123',
    'Lain': '555'
}
# Open the file in a with-block so the handle is closed even if the request
# fails; the original opened it inside the dict and never closed it.
with open('./data/wl.png', "rb") as img_file:
    files = {
        'img': img_file
    }
    # data= carries the form fields, files= carries the multipart file part
    resp = requests.post(url, data=data, files=files)
print(resp.text)
Cookies
import requests
# Simulate a login with requests, then reuse the returned cookies for a member-only page
url = 'https://www.woyaogexing.com/e/member/ddoaction.php'
# NOTE(review): hard-coded credentials — keep real secrets out of source control
data = {
    "ecmsfrom": "",
    "enews": "login",
    "useraccount": "13477296494",
    "password": "123456",
    "lifetime": "2592000",
    "Submit": "登录"
}
resp = requests.post(url, data=data)
# Cookies set by the login response; they carry the session identity
coo = resp.cookies
# print(resp.text)
url2 = 'https://www.woyaogexing.com/e/member/msg/AddMsg/?username=wygx_96494'
# Without cookies the member page does not recognize the session:
# response = requests.get(url2)
# # print(response.text)
# Attach the login cookies so the site treats this request as the logged-in user
response = requests.get(url2, cookies=coo)
print(response.text)
模拟登录
模拟百度登陆
import requests
# Replay cookies captured from a logged-in browser session so Baidu serves the
# personalized (logged-in) page.
# NOTE(review): these cookie values are session secrets and expire; replace with fresh ones
cookies = {
    "BAIDUI": "C628F6100D67DAAA38EE799AC7D55622",
    "F": "1",
    "BIDUPSI": "C628F6100D67DAAA38EE799AC7D55622",
    "PST": "1592218093",
    "BDUS": "FLLUszWWZXfkxidUhufjd1V2UzYTV6Sk9hdC1yYlpmcGtJenJpRE5hUk5DUlZmRVFBQUFBJCQAAAAAAAAAAAEAAAB5xMSA1r-wrszsvfK35wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE187V5NfO1eN0",
    "cfla": "13%3A3",
    "BD_UP": "1a314753",
    "BDRCVFR[u0HWrfoqje0": "mk3SLVN4HKm",
    "BD_HOM": "",
    "H_PS_PSSI": "31905_1441_31325_32139_32231_32257"
}
url = 'https://www.baidu.com/?tn=78040160_14_pg&ch=16'
# cookies= attaches the saved cookies to the request
resp = requests.get(url, cookies=cookies)
print(resp.text)
模拟哔哩哔哩登陆
import requests
# Replay cookies captured from a logged-in browser session so Bilibili serves the
# logged-in homepage.
# NOTE(review): these cookie values are session secrets and expire; replace with fresh ones
cookies = {
    "_uui": "823C561A-B49B-1EE2-74BC-DFB23CFF37A509660infoc",
    "buvid": "76823765-83CD-4D05-B26D-B80C0FF644E7155842infoc",
    "si": "i78mph55",
    "DedeUserI": "18686436",
    "DedeUserID__ckMd": "d383d19f52e573e1",
    "SESSDAT": "56eddd60%2C1607733022%2C5ffcf*61",
    "bili_jc": "753b4ec5dd21dc83d4e11e917d93184b",
    "CURRENT_FNVA": "16",
    "rpdi": "|(J~R~J|lRul0J'ulmkumY~k|",
    "LIVE_BUVI": "AUTO9815921979956077",
    "CURRENT_QUALIT": "80",
    "PVI": "1",
    "bp_video_offset_1868643": "410528496169982512"
}
url = 'https://www.bilibili.com/'
response = requests.get(url, cookies=cookies)
print(response.text)
BS4
BS4重要的两个API——find_all()&attrs[]
find_all(tagName, attrs={}, text=) 返回的是数组
attrs[‘属性名’]
from bs4 import BeautifulSoup
# find_all() demo
import requests
# find_all(tagName, attrs={}, text=) demo: download every image linked on the page.
url = 'http://www.gaokao.com/gkpic/'
resp = requests.get(url)
# Page source; the site is gbk-encoded
txt = resp.content.decode('gbk')
# Parse with BS4
soup = BeautifulSoup(txt, 'html.parser')
# find_all returns a list; search by tag name, attrs dict, or text.
# Here: all <a> tags inside the div whose id is "imgall".
anchors = soup.find_all('div', attrs={"id": "imgall"})[0].find_all('a')
print(len(anchors))
for item in anchors:
    # Hoist the repeated lookup — the original called item.find_all('img')[0]
    # once for the src and again for the alt.
    img = item.find_all('img')[0]
    # attrs['name'] reads an attribute off the tag
    img_src = img.attrs['src']
    # The alt text doubles as the file name
    name = img.attrs['alt']
    # Request the image url and save the bytes
    resp = requests.get(img_src)
    with open('./data/img/'+name+'.jpg', 'wb') as file:
        file.write(resp.content)
    print(img_src)
BS4中其他API
from bs4 import BeautifulSoup
import requests
import re
# Tour of the other BS4 APIs: dot navigation, find_all filters, limit, find,
# contents / parent / descendants.
url = 'http://www.gaokao.com/gkpic/'
# Fetch the page
resp = requests.get(url)
# Page source (the site is gbk-encoded)
html = resp.content.decode('gbk')
# Parse into a BeautifulSoup document. The original also bound the same object
# to a redundant `soup` alias that was never used; it is dropped here.
html = BeautifulSoup(html, 'html.parser')
# 1. Dot navigation walks the tree, but only reaches the FIRST tag of each name
# print(html.body.link)
# 2. Search with find_all(tagName, attrs, text)
# Divs whose class is "wrapper"
# print(len(html.find_all('div', attrs={'class': 'wrapper'})))
# Every div
# print(html.find_all('div'))
# text= searches by text content; returns a list of the matching text objects
# print(html.find_all(text=re.compile('上海')))
# Chained search: run find_all again on an earlier result
# print(len(html.find_all('div',attrs={'id':'imgall','class':'tushuogaokao clearfix'})[0].find_all('a')))
# limit= keeps only the first n results; a slice is more flexible (items n..m)
# print(html.find_all(text=re.compile('上海'),limit=3))
# print(html.find_all(text=re.compile('上海'))[1:3])
# find() returns the first match in the downloaded source (F12 may show extra
# nodes that ajax injects later; those are absent here)
# print(html.find('div'))
# contents: the direct children of a node
# print(html.head.contents)
# parent: the parent node
# print(html.body.div.parent.parent)  returns body's parent: html
# descendants: every descendant of a node
# print(list(html.body.div.descendants))
# print(html.body.div.contents)
保存每个人的系列图片
import requests
from bs4 import BeautifulSoup
import os
# Download every image series: for each gallery link on the index page, walk its
# detail pages (up to 10) and save each page's image under ./data/detail/<title>/.
url = 'http://www.gaokao.com/gkpic/'
resp = requests.get(url)
# Site is gbk-encoded
html = resp.content.decode('gbk')
# Parse with BS4
soup = BeautifulSoup(html, 'html.parser')
# All <a> gallery links
a_all = soup.find_all('div', attrs={'id': 'imgall'})[0].find_all('a')
# print(a_all[0])
for item in a_all:
    # Series title
    title = item.find_all('div')[0].find_all('p', attrs={'class': 'p1'})[0].text
    # Detail-page url pattern:
    # http://www.gaokao.com/e/20160201/56aed7e340aa9.shtml      (page 1)
    # http://www.gaokao.com/e/20160201/56aed7e340aa9_2.shtml    (page 2)
    # http://www.gaokao.com/e/20160201/56aed7e340aa9_3.shtml    (page 3)
    step = 1
    while True:
        # href of the <a> tag (the page-1 url)
        url_detail = item.attrs['href']
        if step != 1:
            # Pages 2+ insert "_<step>" before ".shtml"; the [:-6] slice assumes
            # the href always ends in ".shtml" — TODO confirm
            url_detail = url_detail[:-6]+'_'+str(step)+'.shtml'
        if step == 11:
            # Hard cap: at most 10 pages per series
            break
        # Fetch the detail page
        response = requests.get(url_detail)
        # Page source
        html_detail = response.content.decode('gbk')
        # Parse with BS4
        html_detail = BeautifulSoup(html_detail, 'html.parser')
        try:
            # Image url on this page of the series
            # url_img = html_detail.find_all('div', attrs={'class': 'main'})[0].find_all('p')[1].find_all('img')[0].attrs['src']
            url_img = html_detail.find_all('div', attrs={'class': 'main'})[0].find_all('p', attrs={'style': 'text-align: center;'})[0].find_all('img')[0].attrs['src']
            # Image name
            img_title = html_detail.find_all('h1', attrs={'class': 'bm10'})[0].find_all('a')[0].text
        except BaseException as e:
            # Parsing failed (page missing / past the last page) — stop this series
            break
        # NOTE(review): open() fails unless ./data/detail/<title>/ already exists —
        # the directory-creation code below is commented out and must run once first
        with open('./data/detail/'+title+'/'+img_title+'.png', 'wb') as f:
            resp = requests.get(url_img)
            f.write(resp.content)
        print("第"+title+"系列的第{}张图片已经下载完成".format(step))
        step += 1
        pass
    # # Create the series folder — run this once before downloading
    # try:
    #     os.mkdir("./data/detail/" + title)
    # except Exception as e:
    #     continue
    print("第"+title+"系列下载完成")
爬取前程无忧的信息并存入mysql数据库中或者存入csv文件
import requests
from bs4 import BeautifulSoup
import pymysql
# 分页
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,3.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,4.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# 不同职位
# https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# https://search.51job.com/list/020000,000000,0000,32,9,99,人工智能,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# 不同地区
# 武汉
# https://search.51job.com/list/180200,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# https://search.51job.com/list/010000,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# https://search.51job.com/list/040000,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# 重庆的
# https://search.51job.com/list/060000,000000,0000,32,9,99,Unity,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
# Save the job records into the MySQL database.
def save_to_dataSource(job_list):
    """Insert every job dict in job_list into the `job` table.

    Each dict must provide the keys: job_title, com_title, place, money,
    create_time.
    """
    db = pymysql.connect('localhost', 'login', '123', 'python')
    try:
        # Cursor object runs the SQL statements against the connection
        cursor = db.cursor()
        # Parameterized query: letting the driver escape the values prevents
        # SQL injection and broken statements when a field contains a quote
        # (the original built the statement with str.format).
        sql = "insert into job(job_title,com_title,place,money,create_time) values(%s,%s,%s,%s,%s)"
        for item in job_list:
            cursor.execute(sql, (item['job_title'], item['com_title'], item['place'], item['money'], item['create_time']))
        # Commit the transaction once after all rows are queued
        db.commit()
    finally:
        # The original leaked the connection; always close it
        db.close()
# Save the job records to a csv file.
def save_to_file(job_list):
    """Append one comma-separated line per job dict to the csv file."""
    field_names = ('job_title', 'com_title', 'place', 'money', 'create_time')
    # Append mode so repeated runs accumulate rows; utf-8 for the Chinese fields
    with open('./data/15/前程无忧信息.csv', 'a', encoding='utf-8') as file:
        for record in job_list:
            line = ','.join(str(record[key]) for key in field_names)
            file.write(line + '\n')
# NOTE(review): this call appears BEFORE get_many_page_info is defined below — executed
# top-to-bottom it raises NameError; move it after the function definitions.
get_many_page_info(10)
# Fetch every job on one result page.
def get_one_page_info(url):
    """Download one 51job result page and return a list of job dicts.

    Each dict has the keys job_title, com_title, place, money, create_time;
    commas inside text fields are replaced with '-' so csv columns stay aligned.
    """
    response = requests.get(url)
    # The site is gbk-encoded; decode, then parse with BS4
    page = BeautifulSoup(response.content.decode('gbk'), 'html.parser')
    rows = page.find_all('div', attrs={'id': 'resultList'})[0].find_all('div', attrs={'class': 'el'})
    # The first row is the column-header row, not a job posting
    rows = rows[1:]
    job_list = []
    for row in rows:
        # Job title
        job_title = row.find_all('p', attrs={'class': 't1'})[0].attrs['title'].replace(",", '-')
        # Company name
        com_title = row.find_all('span', attrs={'class': 't2'})[0].find_all('a')[0].attrs['title'].replace(",", '-')
        # Work location
        place = row.find_all('span', attrs={'class': 't3'})[0].text.replace(",", '-')
        # Salary; an empty cell means "negotiable"
        money = row.find_all('span', attrs={'class': 't4'})[0].text
        if money == '':
            money = '工资面议'
        # Posting date
        create_time = row.find_all('span', attrs={'class': 't5'})[0].text
        job_list.append({
            "job_title": job_title,
            'com_title': com_title,
            'place': place,
            'money': money,
            'create_time': create_time,
        })
    return job_list
# Fetch several result pages in a row.
def get_many_page_info(page_num):
    """Scrape result pages 1..page_num and persist each page to the database."""
    # The page number sits inside the url path segment, between these two parts
    base_prefix = "https://search.51job.com/list/020000,000000,0000,32,9,99,Java,2,"
    base_suffix = ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    for page in range(1, page_num + 1):
        # Build this page's url, scrape it, and store the rows
        url = base_prefix + str(page) + base_suffix
        job_list = get_one_page_info(url)
        save_to_dataSource(job_list)
        print("第{}页已经爬取完毕".format(page))