前言:
本篇记录了自己敲过的几种不同类型(文字、图片、音乐等)的获取信息的文章,因为这种文章不适合讲太多话,所以我就直接放源代码与最终效果展示图片,有需要的自取。
一、最基础的请求伪装并保存页面
import requests

# Script 1: disguise the request with a browser User-Agent and save the page.

# Search keyword supplied by the user.
keyword = input('请输入您想要查询的内容')

# Pretend to be a regular desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.47.102 Safari/537.36'
}

# Target URL (left blank in the article on purpose).
url = ''

# Query-string parameters attached to the request.
param = {'wd': keyword}

# Fire the request with the disguised headers.
response = requests.get(url=url, params=param, headers=headers)

# Use the detected encoding so Chinese text decodes correctly.
response.encoding = response.apparent_encoding
data = response.text

# Persist the downloaded HTML as "<keyword>.html".
with open(keyword + '.html', 'w', encoding='utf-8') as f:
    f.write(data)
print('okk')
二、获取文字信息并进行排名
import re
import csv
import requests

# Script 2: pull movie name / year / score / vote count out of a listing page
# with one regex and write the rows to a CSV file.

# 1. UA disguise so the server treats us as a normal browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.48.102 Safari/537.36'
}
# 2. Target URL (left blank in the article on purpose).
url = ''
# 3. Fetch the page.
response = requests.get(url=url, headers=headers)
# 4. Raw HTML to parse.
data = response.text
# print(data)

# Regex capturing one record per <li> item; re.S lets '.' cross newlines.
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
                 r'.*?<p class="">.*?<br>(?P<year>.*?) '
                 r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
                 r'.*?<span>(?P<renshu>.*?)人评价</span>', re.S)

# FIX: the original opened dy.csv without ever closing it (leaked handle,
# possibly unflushed data); use a context manager. newline='' is the csv
# module's documented requirement and avoids blank rows on Windows.
with open('./dy.csv', 'w', encoding='utf-8', newline='') as f:
    csvwriter = csv.writer(f)
    for i in obj.finditer(data):
        print(i.group('name'))
        print(i.group('year'))
        print(i.group('score'))
        print(i.group('renshu'))
        # groupdict preserves the capture order: name, year, score, renshu.
        csvwriter.writerow(i.groupdict().values())
print('成功')
效果图片:
三、获取图片信息
import os
import requests
from lxml import etree

# Script 3: scrape image links from a wallpaper listing page and download each
# picture into ./tupian/.

# 1. UA disguise.
headers = {
    'user-agent': 'Mozilla/5.0 (Winds NT 10.0; Win64; x64) AppleWebKit/5376 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}
# 2. Target URL (left blank in the article on purpose).
url = ''
# Fetch the listing page.
response = requests.get(url=url, headers=headers)
# The site serves GBK; set it explicitly to avoid mojibake.
response.encoding = 'gbk'
data = response.text
# print(data)

# Parse the HTML and select every <li> in the thumbnail list.
tree = etree.HTML(data)
li_list = tree.xpath('/html/body/div[2]/div[1]/div[3]/ul/li')

# FIX: the original crashed with FileNotFoundError on a fresh run because
# ./tupian/ was never created; make sure it exists before writing into it.
os.makedirs('./tupian', exist_ok=True)

for li in li_list:
    # Image title text becomes the local file name.
    name = li.xpath('./a/b/text()')[0] + '.jpg'
    # The @src is site-relative; prepend the host.
    href = 'https://pic.netbian.com/' + li.xpath('./a/img/@src')[0]
    img_data = requests.get(url=href, headers=headers).content
    # print(img_data)
    # Persist the raw bytes.
    img_path = './tupian/' + name
    with open(img_path, 'wb') as f:
        f.write(img_data)
    print(name + '下载成功')
最终效果(图片违规放不出来,给大家看一下目录吧):
四、获取多章节文字信息(注:下方贴出的代码与第三节的图片下载代码相同,疑为粘贴错误,章节文字抓取的思路与其一致——先 xpath 取出各章节链接再逐个请求保存)
import requests
from lxml import etree

# Script 4: same flow as script 3 — fetch a GBK listing page, xpath out each
# item's title and image link, then save every picture under ./tupian/.

# UA disguise.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safi/537.36'
}
# Target URL (left blank in the article on purpose).
url = ''

page = requests.get(url=url, headers=headers)
# The site serves GBK; decode accordingly so titles are readable.
page.encoding = 'gbk'
html = page.text
# print(html)

doc = etree.HTML(html)
for item in doc.xpath('/html/body/div[2]/div[1]/div[3]/ul/li'):
    # Title text + extension gives the local file name.
    title = item.xpath('./a/b/text()')[0] + '.jpg'
    # Site-relative src needs the host prefix.
    link = 'https://pic.netbian.com/' + item.xpath('./a/img/@src')[0]
    payload = requests.get(url=link, headers=headers).content
    # Persist the raw image bytes.
    with open('./tupian/' + title, 'wb') as out:
        out.write(payload)
    print(title + '下载成功')
最终效果:
五、自动联动识别
import requests
from lxml import etree
from hashlib import md5


def get_code(un, pw, id):
    """Recognize the captcha saved at ./a.jpg via the Chaojiying service.

    Args:
        un: Chaojiying account name.
        pw: account password (MD5-hashed before sending, per the API).
        id: software id generated in the Chaojiying user center.

    Returns:
        The JSON response dict from the PostPic endpoint (recognized text is
        in its 'pic_str' field).
    """

    class Chaojiying_Client(object):
        # Thin wrapper around the Chaojiying HTTP API.

        def __init__(self, username, password, soft_id):
            self.username = username
            # The API expects an MD5 hex digest, never the plaintext password.
            self.password = md5(password.encode('utf8')).hexdigest()
            self.soft_id = soft_id
            # Credentials sent with every request.
            self.base_params = {
                'user': self.username,
                'pass2': self.password,
                'softid': self.soft_id,
            }
            self.headers = {
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
            }

        def PostPic(self, im, codetype):
            """im: image bytes; codetype: see http://www.chaojiying.com/price.html"""
            params = {
                'codetype': codetype,
            }
            params.update(self.base_params)
            files = {'userfile': ('ccc.jpg', im)}
            r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                              data=params, files=files, headers=self.headers)
            return r.json()

        def ReportError(self, im_id):
            """im_id: image id of a wrongly recognized captcha."""
            params = {
                'id': im_id,
            }
            params.update(self.base_params)
            r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                              data=params, headers=self.headers)
            return r.json()

    chaojiying = Chaojiying_Client(un, pw, id)
    # FIX: the original wrapped the lines below in `if __name__ == '__main__':`
    # *inside* the function, so get_code() silently returned None whenever the
    # module was imported; the guard is meaningless here and was removed.
    # FIX: close the image file deterministically instead of leaking the
    # handle from open(...).read().
    with open('a.jpg', 'rb') as img_file:  # local captcha image path
        im = img_file.read()
    return chaojiying.PostPic(im, 1902)
# Script 5 driver: fetch the login page, save its captcha image locally, then
# hand it to get_code() for recognition.

# Target URL (left blank in the article on purpose).
url = ''
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb6 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
data = response.text
tree = etree.HTML(data)
# Absolute URL of the captcha image referenced by the page.
img = 'https://so.gushiwen.cn' + tree.xpath('//*[@id="imgCode"]/@src')[0]
# FIX: the original called requests.get(img, headers), which binds the header
# dict to the positional `params` argument (sent as a query string, no UA at
# all); pass it by keyword instead.
r = requests.get(img, headers=headers)
# Save the captcha bytes where get_code() expects them (./a.jpg).
with open('./a.jpg', 'wb') as f:
    f.write(r.content)
# NOTE(review): real-looking account credentials are hard-coded below; they
# should be moved out of source control (env vars / config file).
result = get_code('textzhanghu', 'qin729700', '930758')
print(result['pic_str'])
效果图片:
六、获取音乐信息
import os
from lxml import etree
import requests

# Script 6: download every song linked from a NetEase playlist page into
# ./music/ via the public outer-url media endpoint.

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKi6 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}
# Target playlist URL; remove the '#' fragment from the copied address.
url = ''
response = requests.get(url=url, headers=headers)
data = response.text
# print(data)

tree = etree.HTML(data)
# The hidden <ul class="f-hide"> holds one clean <li><a> per song — no
# extra noise, unlike a contains(@href, "/song?") query.
tr_list = tree.xpath('//ul[@class="f-hide"]/li')

# FIX: `os` was imported but never used while ./music/ was never created,
# so a fresh run crashed with FileNotFoundError; create the directory first.
os.makedirs('./music', exist_ok=True)

for tr in tr_list:
    name = tr.xpath('./a/text()')[0]
    href = tr.xpath('./a/@href')[0]
    # href looks like '/song?id=12345'; split on '=' to get the numeric id.
    song_id = href.split('=')[1]
    download = 'https://music.163.com/song/media/outer/url?id=' + song_id
    # FIX: the original passed `headers` positionally, which requests.get
    # binds to `params` (query string) — the UA header was never sent.
    r = requests.get(download, headers=headers)
    d = r.content
    with open('./music/%s.mp3' % name, 'wb') as f:
        f.write(d)
    print('%s 下载成功' % name)
最终效果:
七、获取音乐附带信息
import requests
import json
import re

# Script 7: POST pre-captured encrypted form fields to the lyric API, strip
# the '[mm:ss.xx]' timestamps, and save the plain lyrics to a text file.

# Target lyric API URL (left blank in the article on purpose).
url = ''
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWe.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}
# Encrypted request payload captured from the browser's network panel.
data = {
    'params':'NiHz5BCMsR5TXmlrbME6QMJhzWT3diBLZj3iAUWAJ+hccTjMLGjMp95hqS1wVKP9eqG/+JpTNpGaTMI3OMhBPB5nKxTeYTIWjyYFF3uy19UUScojX3GIZb2r9+Z1Chvc',
    'encSecKey':'df5b6563fd7b802e9189f9fb204da42eabc5f182bb91072013be7f489014e1e7cb116f546ee8ae325f6d419ba984be3a5cb7fd6ec7afc9582584e77f8345e65c6f36f7bc6b0e66adc26b4bf0dee4529a4f72596f19546895d50f1ace61bcb430da30cc2188f8b278eac02828f4776de5b844d0df3378c37b929fb8d32179ec4c'
}
# FIX: the original called requests.post(url, data, headers) — the third
# positional argument of requests.post is `json`, so the header dict was
# never sent as headers; pass it by keyword.
r = requests.post(url, data, headers=headers)
d = r.text
# The body is a JSON string, not a dict — d['lrc'] would raise; parse first.
obj = json.loads(d)
lrc = obj['lrc']['lyric']
# Remove the leading '[timestamp]' tag on each lyric line.
pattern = re.compile(r'\[.*\]')
lrc1 = re.sub(pattern, '', lrc)
# print(lrc1)
# FIX: write with an explicit UTF-8 encoding so Chinese lyrics save
# correctly regardless of the platform's default locale encoding.
with open('./歌词.txt', 'w', encoding='utf-8') as f:
    f.write(lrc1)
print('下载成功')
最终效果: