下面是关于爬虫的笔记。
import urllib.request
import urllib.parse
# Anatomy of a URL:
#   protocol (http/https) · host (www.baidu.com) · port (80 for http,
#   443 for https) · path · query parameters (wd=...) · fragment/anchor
url = 'https://www.baidu.com'
# Without a browser-like User-Agent the server detects the crawler and
# returns a truncated page, so disguise the request with real headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32'
}
# urlopen() itself cannot accept a headers dict, so build a Request
# object first; `headers` must be passed by keyword because `data` sits
# between url and headers in Request's signature.
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
# Goal: fetch the Baidu search results page for 周杰伦.
url = 'https://www.baidu.com/s?wd='
# Customising the request object is the first tool against anti-crawling.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32'
}
# Encoding matters: quote() percent-encodes the Chinese characters into
# a URL-safe form that can be appended to the query string.
name = urllib.parse.quote('周杰伦')
url = url + name
# build the request, send it pretending to be a browser, read the body
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
# urlencode() is for several query parameters at once, e.g.
#   https://www.baidu.com/s?wd=周杰伦&sex=男
base = 'https://www.baidu.com/s?'
data = {
    'wd': '周杰伦',
    'sex': '男',
    'location': '中国台湾',
}
# percent-encode every pair and join them with '&'
new_data = urllib.parse.urlencode(data)
# full resource path = base URL + encoded query string
url = base + new_data
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32'
}
# customise, send as a "browser", read the page source
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
# POST request: Baidu Translate's suggestion endpoint.
url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32'
}
data = {
    'kw': 'spider'
}
# POST parameters are never appended to the URL: they must be urlencoded
# AND byte-encoded, then handed to Request via the `data` argument.
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllibib_response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# the endpoint answers with JSON text: parse str --> Python object
import json
obj = json.loads(content)
print(obj)
# POST example: Baidu detailed translation (v2transapi).
# BUG FIXED: in the original notes ("this example doesn't work, need to
# find the error") the browser *request headers* (Cookie, Accept, Referer,
# ...) had been pasted into the `data` dict — i.e. they were sent as the
# POST body — while the actual form fields were missing and the Cookie
# header (which this endpoint's anti-crawl check requires) was absent.
url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32',
    # Cookie moved here from `data`: it is a request header, not a form field.
    'Cookie': 'BAIDUID=59557AB2F4DB1AB8145ECBFCF96D7290:FG=1; BIDUPSID=59557AB2F4DB1AB8145ECBFCF96D7290; PSTM=1646571173; BAIDUID_BFESS=10AA94C276A7269FD1E798902C740916:FG=1; MCITY=-340%3A; BDUSS=XVmOEJxeUlDMmd-ZFBFSnhzSDRXOUkteFFIZ0VCUUg5QzY4QzVTTWQ2YVlmNUJpRVFBQUFBJCQAAAAAAAAAAAEAAACIqD57ZGdkdGhoZ2ZmcmhqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJjyaGKY8mhiR; BDUSS_BFESS=XVmOEJxeUlDMmd-ZFBFSnhzSDRXOUkteFFIZ0VCUUg5QzY4QzVTTWQ2YVlmNUJpRVFBQUFBJCQAAAAAAAAAAAEAAACIqD57ZGdkdGhoZ2ZmcmhqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJjyaGKY8mhiR; BA_HECTOR=2l0180242h808l0h2k1h7acbn0r; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; RT="z=1&dm=baidu.com&si=uomo2s2r8f&ss=l2uje69s&sl=2&tt=2rb&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=4pz&ul=3ivo&hd=3iyf"; ZD_ENTRY=bing; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=7; H_PS_PSSID=36309_36367_34812_35911_36165_34584_35979_36074_35802_26350_36311_36061; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1651902529; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1651903118; ab_sr=1.0.1_Mzc1NTU5YjI3YmNkNDM2YjJjMGQxMTRhN2I5YjNkNjBlZWU1NTU5ODZkNTZiNmY3NDQzOTdkM2MwNDEwMjc5Njc3MjM1ODJlNTkyNzhmMjg3ZDU3YzIwYzZjNDE2ZTJjYjQ3NDU1ZDI5ZDVmNDU5YzY2ODk2ZTg3N2M5YjZmZTZkM2I3ZGY0MjZkNjk5MjdlMDQzZTU2OTJkYjExNDI0ODM2N2QyOGRhYmYyNjI1M2U2YjEyYzhlNTA0ODRjNDc0',
}
# The real POST form fields of v2transapi. NOTE(review): `sign` and `token`
# are generated per-session by Baidu's JavaScript; copy current values from
# the browser's dev-tools Network tab or the request will still be rejected.
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'love',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    # 'sign': '<capture from browser>',
    # 'token': '<capture from browser>',
}
# POST body must be urlencoded then byte-encoded
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
import json
obj = json.loads(content)
print(obj)
# AJAX GET request: fetch page 1 of Douban's movie search and save it.
url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=0&genres=%E5%8A%A8%E4%BD%9C'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32'
}
# 1. customise the request object
request = urllib.request.Request(url=url, headers=headers)
# 2. fetch and decode the response body
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# 3. write the data to disk; open() defaults to the locale encoding
#    (gbk on Chinese Windows), so encoding='utf-8' is required to keep
#    the Chinese text intact.
with open('douban1.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
爬取豆瓣电影前 10 页的数据。注意:豆瓣的接口已经改版,与教程不同——新接口不再使用 limit 参数,按旧写法爬取到的是空数据。写笔记时尚未调试;待调试成功后,会用正确的代码和数据替换本节内容。
# Fetch the first N pages of Douban movies (find the URL pattern):
#   start = (page - 1) * 20
# Steps: 1. build the request  2. fetch the data  3. save it to disk
def create_request(page):
    """Build the customised Request for one page of Douban's movie search.

    BUG FIXED: the original base_url already contained `start=0` and ended
    with the genres value, so `base_url + data` produced a malformed query
    string such as `...genres=%E5%8A%A8%E4%BD%9Cstart=20&limit=20`
    (missing '&', duplicate `start`) — one reason the crawl came back
    empty. The base now ends with '&' and omits the hard-coded start.
    """
    base_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&genres=%E5%8A%A8%E4%BD%9C&'
    params = {
        'start': (page - 1) * 20,  # 20 results per page
        'limit': 20,
    }
    url = base_url + urllib.parse.urlencode(params)
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32'
    }
    return urllib.request.Request(url=url, headers=headers)
def get_content(request):
    """Send `request` and return the response body decoded as UTF-8.

    FIX: the original never closed the HTTP response (resource leak);
    the `with` block closes the connection even if read()/decode() raises.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def down_load(page, content):
    """Save one page's JSON payload to douban<page>.json, UTF-8 encoded."""
    path = 'douban' + str(page) + '.json'
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(content)
# Script entry point: prompt for a page range and crawl each page in turn.
if __name__ == '__main__':
    first = int(input('起始的页码:'))
    last = int(input('结束的页面:'))
    for page in range(first, last + 1):
        # one request object per page, fetched and then written to disk
        req = create_request(page)
        content = get_content(req)
        down_load(page, content)
这个是关于数据分析的笔记。
# Heat map: imshow renders a matrix as a pseudo-colour image.
n = 1000
axis = np.linspace(-3, 3, n)
x, y = np.meshgrid(axis, axis)
z = (1 - x / 2 + x**5 + y**3) * np.exp(-x**2 - y**2)
mp.imshow(z, cmap='jet', origin='lower')  # origin='lower': row 0 at the bottom
mp.colorbar()                             # value-to-colour legend
mp.show()
# 3D plotting: 300 random points from the standard normal distribution.
n = 300
x = np.random.normal(0, 1, n)
y = np.random.normal(0, 1, n)
z = np.random.normal(0, 1, n)
# draw the 3-D point cloud
fig = mp.figure('3D scatter', facecolor='lightgray')
# FIX: mp.gca(projection='3d') was deprecated in Matplotlib 3.4 and removed
# in 3.6; add_subplot(projection='3d') is the supported replacement.
ax3d = fig.add_subplot(projection='3d')
ax3d.set_xlabel('x')
ax3d.set_ylabel('y')
ax3d.set_zlabel('z')
d = x**2 + y**2 + z**2  # squared distance from origin, used as colour value
ax3d.scatter(x, y, z, s=70, marker='o', c=d, cmap='jet')
mp.show()
# 3-D surface plot of the same two-variable function.
n = 1000
x, y = np.meshgrid(np.linspace(-3, 3, n), np.linspace(-3, 3, n))
z = (1 - x / 2 + x**5 + y**3) * np.exp(-x**2 - y**2)
fig = mp.figure('3D surface', facecolor='lightgray')
# FIX: mp.gca(projection='3d') was deprecated in Matplotlib 3.4 and removed
# in 3.6; create the 3-D axes explicitly instead.
ax3d = fig.add_subplot(projection='3d')
ax3d.set_xlabel('x')
ax3d.set_ylabel('y')
ax3d.set_zlabel('z')
# rstride/cstride: row/column sampling steps used when drawing the mesh
ax3d.plot_surface(x, y, z, cstride=60, rstride=30, cmap='jet')
mp.show()
# 3-D wireframe plot of the same two-variable function.
n = 1000
x, y = np.meshgrid(np.linspace(-3, 3, n), np.linspace(-3, 3, n))
z = (1 - x / 2 + x**5 + y**3) * np.exp(-x**2 - y**2)
fig = mp.figure('3D surface', facecolor='lightgray')
# FIX: mp.gca(projection='3d') was deprecated in Matplotlib 3.4 and removed
# in 3.6; create the 3-D axes explicitly instead.
ax3d = fig.add_subplot(projection='3d')
ax3d.set_xlabel('x')
ax3d.set_ylabel('y')
ax3d.set_zlabel('z')
# rstride/cstride: row/column sampling steps for the wire mesh
ax3d.plot_wireframe(x, y, z, cstride=60, rstride=30, linewidth=1, color='dodgerblue')
mp.show()
# Polar coordinates: an Archimedean spiral r = 0.8 * theta.
fig = mp.figure('Polar', facecolor='lightgray')
# FIX: mp.gca(projection='polar') was deprecated in Matplotlib 3.4 and
# removed in 3.6; add the polar axes explicitly (it also becomes the
# current axes, so the pyplot calls below still target it).
fig.add_subplot(projection='polar')
mp.title('Polar', fontsize=20)
mp.xlabel(r'$\theta$', fontsize=14)
mp.ylabel(r'$\rho$', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# prepare the spiral data
t = np.linspace(0, 4 * np.pi, 1000)
r = 0.8 * t
mp.plot(t, r)
mp.show()
# Simple animation: 100 bubbles of random colours that keep growing.
n = 100
# One structured record per bubble.
# FIXES: the original called np.zeros(100, ...) instead of np.zeros(n, ...)
# (the constant ignored the `n` it had just defined), and declared scalar
# fields as (float, 1), which NumPy deprecates — "(type, 1)" is slated to
# mean a (1,)-shaped subarray rather than a scalar field.
balls = np.zeros(n, dtype=[
    ('position', float, (2,)),   # x, y in [0, 1)
    ('size', float),             # marker area in points^2
    ('growth', float),           # size increment applied each frame
    ('color', float, (4,))])     # RGBA components in [0, 1)
# initialise every field with random values
balls['position'] = np.random.uniform(0, 1, (n, 2))
balls['size'] = np.random.uniform(50, 70, n)
balls['growth'] = np.random.uniform(10, 20, n)
balls['color'] = np.random.uniform(0, 1, (n, 4))
# Draw the initial frame.
mp.figure('Animation', facecolor='lightgray')
mp.title('Animation', fontsize=16)
mp.xticks([])  # hide the x-axis ticks
mp.yticks([])  # hide the y-axis ticks
xs = balls['position'][:, 0]
ys = balls['position'][:, 1]
sc = mp.scatter(xs, ys, s=balls['size'], color=balls['color'])
def update(number):
    """Advance one animation frame: respawn one bubble, grow all of them."""
    idx = number % 100
    # replace the chosen bubble with a fresh small one at a new spot
    balls['position'][idx] = np.random.uniform(0, 1, (1, 2))
    balls['size'][idx] = np.random.uniform(50, 70, 1)
    # every bubble grows by its own rate each frame
    balls['size'] += balls['growth']
    # push the new state to the scatter artist
    sc.set_sizes(balls['size'])        # updated marker areas
    sc.set_offsets(balls['position'])  # updated positions

# drive the animation: call update() every 30 ms on the current figure
anim = ma.FuncAnimation(mp.gcf(), update, interval=30)
mp.show()