Python Web Scraping Course Notes

11 Oct 2020

The company launched a new project over the National Day holiday, so I haven't had any time off, and the scraping class started on October 9th — there is really a lot going on. The opening ceremony and the first lesson were fairly light; we didn't write any code. The first lesson mainly covered the concept of ports, communication protocols, packet unpacking and packing, HTTPS, HTTP requests and responses, a sample HTTP request, the advantages of crawlers, and the types of crawlers.
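The request/response exchange itself wasn't written out in class; as a reminder of what an HTTP request and response look like, here is a minimal sketch using Python's built-in http.client (www.example.com is just a placeholder host, not from the course):

import http.client

# One HTTP request/response cycle, shown explicitly
conn = http.client.HTTPSConnection("www.example.com")
conn.request("GET", "/", headers={"User-Agent": "Mozilla/5.0"})
resp = conn.getresponse()
print(resp.status, resp.reason)   # e.g. 200 OK
print(resp.read()[:200])          # first bytes of the response body
conn.close()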

Lesson 2

We started writing code. Topics: GET and POST, the components of a URL (especially the percent-encoded Chinese characters), User-Agent, crawling and anti-crawling mechanisms, Referer, status codes, and packet-capture tools. The key points below: urllib.request, urlopen, Request(), read(), urllib.parse, urlencode and decode.

# Method 1: download an image with requests
import requests

url = 'https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=2534506313,1688529724&fm=26&gp=0.jpg'
req = requests.get(url)

# req.content is the raw bytes of the response, so open the file in binary mode
fn = open('code.png', 'wb')
fn.write(req.content)
fn.close()

# Method 1.1: use a with-block so the file is closed automatically
with open("code2.jpg", "wb") as f:
    f.write(req.content)

# Method 2: let urllib download the file directly
from urllib import request

url = 'https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=2534506313,1688529724&fm=26&gp=0.jpg'
request.urlretrieve(url, "code3.png")

import urllib.request

# Get the response object with urlopen(); urlopen() does not let you customize the User-Agent
# response = urllib.request.urlopen("https://www.baidu.com")
# Use read() to read the content of the object;
# decode() turns bytes into str, encode() turns str into bytes
# html = response.read().decode("utf-8")
# print(type(html), html)

url = "https://www.baidu.com"

# It is recommended to also put a cookie in the headers
# headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36"}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}

# Create the request object
req = urllib.request.Request(url, headers=headers)
# Get the response object with urlopen()
res = urllib.request.urlopen(req)
html = res.read().decode("utf-8")
# print(html)
print(res.getcode())  # return the status code
print(res.geturl())   # return the URL we actually requested

The percent-encoded form of 海贼王 is %E6%B5%B7%E8%B4%BC%E7%8E%8B; each Chinese character becomes three %XX groups, because UTF-8 uses three bytes per CJK character.
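A quick sketch (not from the class code) of the round trip with urllib.parse.quote and unquote, which encode and decode a single value rather than a whole query string:

import urllib.parse

encoded = urllib.parse.quote("海贼王")
print(encoded)                        # %E6%B5%B7%E8%B4%BC%E7%8E%8B
print(urllib.parse.unquote(encoded))  # 海贼王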

# url = "https://www.baidu.com/s?wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B"
import urllib.parse
import urllib.request

te = {"wd": "海贼王"}
result = urllib.parse.urlencode(te)
print(result)  # prints the percent-encoded form, e.g. wd=%E6%B5%B7%E8%B4%BC%E7%8E%8B

# Search for something and save the result page as a local HTML file
baseurl = "https://www.baidu.com/s?"

key = input("请输入要搜索的内容:")
# Encode the keyword
w = {"wd": key}
k = urllib.parse.urlencode(w)

# Concatenate the URL
url = baseurl + k
# print(url)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'cookie': 'BAIDUID=BD7E1E18524FFC27F134FC0750F2A3B8:FG=1; BIDUPSID=BD7E1E18524FFC27F134FC0750F2A3B8; PSTM=1588334179; BDUSS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDUSS_BFESS=3psc29UUUl0Yy14MkF6MFRXMmR0dVRjejc5MDlsQ2tEY2lqajc1NHY1NWNKdnRlRVFBQUFBJCQAAAAAAAAAAAEAAABvEP03d2FycmVuenp5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFyZ015cmdNeM; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; PSINO=3; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; H_PS_PSSID=32814_32617_1443_32788_7544_32705_32230_7517_32116_32719_22159'
}
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode("utf-8")

# Write to a file
with open("search.html", "w", encoding="utf-8") as f:
    f.write(html)

# The wd= approach: encode only the keyword with quote()
baseurl = 'https://www.baidu.com/s?wd='

key = input('请输入你要搜索的内容:')
k = urllib.parse.quote(key)

# Concatenate the URL
url = baseurl + k

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Cookie': 'BIDUPSID=23F0C104655E78ACD11DB1E20FA56630; PSTM=1592045183; BD_UPN=12314753; sug=0; sugstore=0; ORIGIN=0; bdime=0; BAIDUID=23F0C104655E78AC9F0FB18960BCA3D3:SL=0:NR=10:FG=1; BDUSS=ldxR1FyQ2FEaVZ5UWFjTDlRbThVZHJUQTY1S09PSU81SXlHaUpubVpEY0FMakZmRVFBQUFBJCQAAAAAAAAAAAEAAADzvSajSjdnaGgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAChCV8AoQlfb; BDUSS_BFESS=ldxR1FyQ2FEaVZ5UWFjTDlRbThVZHJUQTY1S09PSU81SXlHaUpubVpEY0FMakZmRVFBQUFBJCQAAAAAAAAAAAEAAADzvSajSjdnaGgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAChCV8AoQlfb; MCITY=-158%3A; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=6; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; COOKIE_SESSION=204_0_5_9_4_6_0_0_5_4_0_0_533_0_0_0_1602246393_0_1602250500%7C9%2369429_193_1601361993%7C9; H_PS_PSSID=32757_32617_1428_7566_7544_31660_32723_32230_7517_32116_32718; H_PS_645EC=ab4cD3QpA7yZJBKDrrzZqesHzhDrwV%2BYww0WVHtmGJ3Adcj0qvjZIVV%2F9q4'
}

# Create the request object
req = urllib.request.Request(url, headers=headers)

# Get the response object
res = urllib.request.urlopen(req)

# Read the response
html = res.read().decode('utf-8')

# Write to a file
with open('搜索3.html', 'w', encoding='utf-8') as f:
    f.write(html)
Lesson 3 (the second scraping lesson with code)

Time really flies; counting the opening ceremony, we have already had four crawler sessions. This lesson mainly covered the urllib.parse module, its common methods, and the GET and POST request styles. We then wrote the same crawler three ways: as a plain script, with functions, and as a class. Finally, we were introduced to the requests module, which is simpler to use, its response methods, setting a proxy with requests, and SSL. Scraper code is much longer than data-analysis code; I probably can't fit it all into one note.
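Before the tieba examples below (which all use urllib), here is a minimal sketch of the requests proxy/SSL usage mentioned above; the proxy address and the httpbin.org URL are placeholders, not from the class code:

import requests

# Hypothetical local proxy address, just for illustration
proxies = {
    "http": "http://127.0.0.1:7890",
    "https": "http://127.0.0.1:7890",
}
headers = {"User-Agent": "Mozilla/5.0"}

# Route the request through the proxy; verify=False skips SSL certificate
# verification (handy for self-signed certificates, but less secure)
res = requests.get("https://httpbin.org/ip", headers=headers,
                   proxies=proxies, verify=False, timeout=10)
print(res.status_code)
print(res.text)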
import urllib.request
import urllib.parse

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}

Overall structure: with GET, the query parameters show up in the URL itself; with POST, you pass the form data as a data argument to the Request method (a short POST sketch follows the script below).

# With POST, the query parameters / submitted data are hidden in the form and do not appear in the URL
# The data must be submitted as bytes, not str
name = input("请输入贴吧的名字:")
begin = int(input("请输入起始页:"))
end = int(input("请输入结束页:"))
# Re-encode the keyword
kw = {"kw": name}
kw = urllib.parse.urlencode(kw)
# Concatenate the URL, send the request, get the response
for i in range(begin, end + 1):
    pn = (i - 1) * 50
    # print(pn)  "https://tieba.baidu.com/f?kw=%???&pn=0"
    baseurl = "https://tieba.baidu.com/f?"
    url = baseurl + kw + "&pn=" + str(pn)
    # print(url)
    # Send the request
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    # Write to a file
    filename = "第" + str(i) + "页.html"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
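The POST side was only described, not coded, so here is a minimal sketch under an assumed stand-in endpoint (httpbin.org/post, not from the class) showing how form data is passed as bytes through the data argument:

import urllib.parse
import urllib.request

headers = {"User-Agent": "Mozilla/5.0"}
# Hypothetical endpoint for illustration; Tieba itself is fetched with GET above
post_url = "https://httpbin.org/post"
form = {"kw": "海贼王", "pn": "0"}
# urlencode() builds the form string, encode() turns it into the bytes Request expects
data = urllib.parse.urlencode(form).encode("utf-8")
req = urllib.request.Request(post_url, data=data, headers=headers)  # passing data makes this a POST
res = urllib.request.urlopen(req)
print(res.read().decode("utf-8"))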
import urllib.request
import urllib.parse

def readpage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    return html

def writepage(filename, html):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)

# Main function
def main():
    name = input("请输入贴吧的名字:")
    begin = int(input("请输入起始页:"))
    end = int(input("请输入结束页:"))
    # Re-encode the keyword
    kw = {"kw": name}
    kw = urllib.parse.urlencode(kw)

    for i in range(begin, end + 1):
        pn = (i - 1) * 50
        # print(pn)  "https://tieba.baidu.com/f?kw=%???&pn=0"
        baseurl = "https://tieba.baidu.com/f?"
        url = baseurl + kw + "&pn=" + str(pn)
        # Call the helper functions
        html = readpage(url)
        filename = "第" + str(i) + "页.html"
        writepage(filename, html)

if __name__ == '__main__':
    main()
import urllib.request
import urllib.parse

class BaiduSpider():
    # Put the parts that never change (headers, base URL) in __init__
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.baseurl = "https://tieba.baidu.com/f?"

    def readpage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        return html

    def writepage(self, filename, html):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
            print("write successfully")

    def main(self):
        name = input("请输入贴吧的名字:")
        begin = int(input("请输入起始页:"))
        end = int(input("请输入结束页:"))
        # Re-encode the keyword
        kw = {"kw": name}
        kw = urllib.parse.urlencode(kw)

        for i in range(begin, end + 1):
            pn = (i - 1) * 50
            # print(pn)  "https://tieba.baidu.com/f?kw=%???&pn=0"
            url = self.baseurl + kw + "&pn=" + str(pn)
            html = self.readpage(url)
            filename = "第" + str(i) + "页.html"
            self.writepage(filename, html)

if __name__ == '__main__':
    spider = BaiduSpider()
    spider.main()