# 案例一:酷狗音乐 (Case 1: Kugou Music downloader)
'''
步骤:
1.先拿到通过搜索的url,分析url包含哪些参数是在变的
2.然后对通过搜索的url发送请求,解析并拿到想要的数据用来连拼接歌曲的url
3.对歌曲的url发送请求,解析并拿到下载歌曲的url
4.拿到下载歌曲的url,进行下载保存到一个文件夹里面 对下载的歌曲命名为:歌名+.mp4
用MD5对signature进行解密
'''
import requests
import time
from hashlib import md5
from urllib import parse
import json
import re
class KugouSongSpider(object):
    """Search Kugou Music for a song and download the user's pick.

    Flow: build a signed search URL -> parse the JSONP song list ->
    let the user choose an index -> fetch the play-page JSONP ->
    extract the real play_url -> save the bytes under ./酷狗歌曲/.
    """

    def __init__(self):
        # Search endpoint; format slots: keyword, clienttime, mid, uuid, signature.
        self.url = 'https://complexsearch.kugou.com/v2/search/song?callback=callback123&keyword={}&page=1&pagesize=30&bitrate=0&isfuzzy=0&tag=em&inputtype=0&platform=WebFilter&userid=0&clientver=2000&iscorrection=1&privilege_filter=0&srcappid=2919&clienttime={}&mid={}&uuid={}&dfid=-&signature={}'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'cookie': 'kg_mid=ecc98921826e1bfe5f18c7e63a0b8d46; kg_dfid=0hO1xg4HydW64ZSjCx377ZFv; kg_dfid_collect=d41d8cd98f00b204e9800998ecf8427e; Hm_lvt_aedee6983d4cfc62f509129360d6bb3d=1623140158,1623465800,1624347175,1624352681; Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d=1624362070',
            'accept-language': 'zh-CN,zh;q=0.9',
            'referer': 'https://www.kugou.com/'
        }
        self.word = input("请您输入想下载歌曲名:")
        # 13-digit millisecond timestamp; reused as clienttime/mid/uuid below.
        self.timec = str(time.time() * 1000)[:13]
        # Play-page endpoint; format slots: file hash, album id, timestamp.
        self.two_url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191027654644883288304_1624443984782&hash={}&dfid=0hO1xg4HydW64ZSjCx377ZFv&mid=ecc98921826e1bfe5f18c7e63a0b8d46&platid=4&album_id={}&_={}'
        self.song_list = []      # per-song dicts: AlbumID/FileHash/FileName/SingerName
        self.FileName_list = []  # numbered display names, in search-result order
        self.Songer_list = []    # singer names, parallel to FileName_list

    def get_page(self, url):
        """GET *url* and return the JSON payload with the JSONP wrapper
        ``callback123(...)`` stripped."""
        res = requests.get(url=url, headers=self.headers)
        html = res.content.decode('utf-8')
        # Strip only the surrounding wrapper.  The original did
        # .replace(')', ''), which deleted EVERY ')' in the payload and
        # corrupted any song title containing a parenthesis.
        if html.startswith('callback123(') and html.endswith(')'):
            html = html[len('callback123('):-1]
        return html

    def get_sign(self):
        """Return the md5 signature of the sorted query parameters
        sandwiched between the fixed Kugou salt strings."""
        sign_text = [
            "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt",
            "bitrate=0",
            "callback=callback123",
            "clienttime={}".format(self.timec),
            "clientver=2000",
            "dfid=-",
            "inputtype=0",
            "iscorrection=1",
            "isfuzzy=0",
            "keyword={}".format(self.word),
            "mid={}".format(self.timec),
            "page=1",
            "pagesize=30",
            "platform=WebFilter",
            "privilege_filter=0",
            "srcappid=2919",
            "tag=em",
            "userid=0",
            "uuid={}".format(self.timec),
            "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt"
        ]
        s = md5()
        # ''.join instead of the original quadratic += concatenation loop.
        s.update(''.join(sign_text).encode())
        return s.hexdigest()

    def parse_page(self, html):
        """Parse the search JSON, print an indexed song menu, ask the user
        for a number and fetch that song's download page."""
        python_json = json.loads(html)
        for s, li in enumerate(python_json['data']['lists']):
            li_dict = {
                'AlbumID': li['AlbumID'],
                'FileHash': li['FileHash'],
                'FileName': str(s) + '-------' + li['FileName'],
                'SingerName': li['SingerName'],
            }
            self.song_list.append(li_dict)
            self.FileName_list.append(li_dict['FileName'])
            self.Songer_list.append(li_dict['SingerName'])
        inf = dict(zip(self.FileName_list, self.Songer_list))
        inf = json.dumps(inf, ensure_ascii=False, indent=2)
        # Pretty-print the menu as "<index>-------<title>-------<singer>".
        print(inf.replace('"', '').replace(':', '-------'))
        number = int(input("请输入对应的编号[0-29]:"))
        file_hash = self.song_list[number]['FileHash']
        album_id = self.song_list[number]['AlbumID']
        self.get_down_page(file_hash, album_id)

    def get_down_page(self, file_hash, album_id):
        """Fetch the play-page JSONP for the chosen song and parse it."""
        two_url = self.two_url.format(file_hash, album_id, self.timec)
        res = requests.get(url=two_url, headers=self.headers)
        self.parse_two_page(res.content.decode('utf-8'))

    def parse_two_page(self, json_str):
        """Pull play_url and song_name out of the play-page body, then
        download and save the track."""
        play_url = re.compile('jQuery191027654644883288304_1624443984782.*?"play_url":"(.*?)","authors":.*?', re.S)
        song_Name = re.compile('jQuery191027654644883288304_1624443984782.*?","song_name":(".*?"),"lyrics":', re.S)
        down_url = play_url.findall(json_str)
        Name = song_Name.findall(json_str)
        SongName = None
        for name in Name:
            # The capture keeps the surrounding quotes, so it is a JSON
            # string literal; json.loads decodes it safely (the original
            # used eval() on scraped data — an injection risk).
            SongName = json.loads(name)
        for u in down_url:
            down_play_url = u.replace("\\", "")
            print(down_play_url)
            self.write_down_music(down_play_url, SongName)
            print(f"歌曲:{SongName} 已下载并保存成功!!!")

    def write_down_music(self, down_play_url, SongName):
        """Stream the audio bytes to ./酷狗歌曲/<name>.mp4 (the directory
        must already exist)."""
        filename = './酷狗歌曲/' + SongName + '.mp4'
        with open(filename, 'wb') as f:
            f.write(requests.get(url=down_play_url, headers=self.headers).content)

    def main(self):
        """Entry point: sign the search URL, fetch it, hand off parsing."""
        sign = self.get_sign()
        keyword = parse.quote(self.word)
        url = self.url.format(keyword, self.timec, self.timec, self.timec, sign)
        self.parse_page(self.get_page(url))
if __name__ == '__main__':
    # Build the spider (prompts for a song name) and run the pipeline.
    KugouSongSpider().main()
# 案例二:有道翻译 (Case 2: Youdao Translate)
'''
步骤:
<1>.找到对应的详情页 拿到url
<2>.分析--->通过post方式发送请求--->拿到data
<3>.找到sign加密的位置,并分析它的加密方式
<4>.然后采用MD5方式对sign进行解密--->得到sign参数
sign: n.md5("fanyideskweb" + e + i + "Tbh5E8=q6U3EXe+&L[4c@")
i表示13位的时间戳+1位(0-9)随机数 e表示请输入需要翻译的内容
'''
from fake_useragent import UserAgent
import requests
import time
from hashlib import md5
import random
def get_data(i):
    """Build the salt, timestamp and md5 sign for a Youdao request.

    i: the text to be translated.
    Returns (salt, ts, sign): ts is the 13-digit millisecond timestamp,
    salt is ts plus one random digit, and sign is
    md5("fanyideskweb" + i + salt + secret).
    """
    ts = str(int(time.time() * 1000))
    # salt must be lts + one random digit.  The original called
    # time.time() twice, so salt and ts could disagree by a millisecond.
    salt = ts + str(random.randint(0, 9))
    string = "fanyideskweb" + i + salt + "Tbh5E8=q6U3EXe+&L[4c@"
    s = md5()
    s.update(string.encode())
    return salt, ts, s.hexdigest()
def Youdao(i, to=None):
    """POST *i* to the Youdao translate endpoint and return the translation.

    i: text to translate.
    to: target-language code ('en', 'ja', 'ko', ...).  When None, falls
        back to the module-level global ``to`` if the script set one
        (preserving the original behaviour), else 'AUTO'.  The original
        read the global unconditionally and raised NameError whenever
        this module was imported rather than run as a script.
    """
    if to is None:
        to = globals().get('to', 'AUTO')
    salt, ts, sign = get_data(i)
    url = 'https://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
    headers = {
        'user-agent': UserAgent().random,
        'Cookie': 'OUTFOX_SEARCH_USER_ID=115997245@10.108.160.101; OUTFOX_SEARCH_USER_ID_NCOO=303186368.56868434; JSESSIONID=aaaPc_J_dz8H9dKB-xxOx; ___rl__test__cookies=1623889930051',
        'Host': 'fanyi.youdao.com',
        'Origin': 'https://fanyi.youdao.com',
        'Referer': 'https://fanyi.youdao.com/'
    }
    data = {
        'i': i,
        'from': 'AUTO',
        'to': to,
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': salt,
        'sign': sign,
        'lts': ts,
        'bv': '4f7ca50d9eda878f3f40fb696cce4d6d',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTlME'
    }
    res = requests.post(url, headers=headers, data=data).json()
    # First translated segment of the first paragraph.
    return res['translateResult'][0][0]['tgt']
if __name__=='__main__':
i = input("请输入翻译的内容:\t")
to = input("请选择您想翻译(英语:en 日语:ja 韩语:ko 中文: ch)的语言\t")
result = Youdao(i)
print("翻译结果是:"+result)
# 案例三:虎牙直播 (Case 3: Huya live-stream listings)
from selenium import webdriver
import pymysql
from lxml import etree
class HuYa(object):
    """Crawl Huya live-stream listing pages with Selenium/Firefox and
    insert (title, viewer count, anchor) rows into the MySQL table
    ``zhibo``."""

    def my_sql(self):
        """Open and return a pymysql connection to the local database."""
        my = pymysql.connect(host='127.0.0.1', user='dongxizhi', passwd='dongxizhi', db='dongxizhi')
        return my

    def Firefox(self, my):
        """Drive Firefox through the listing pages, storing every visible
        stream, until no 'next page' link remains in the page source."""
        url = 'https://www.huya.com/l'
        driver = webdriver.Firefox()
        driver.get(url)
        page = 1
        while True:
            print("-----" + str(page) + "-----")
            page += 1
            html = driver.page_source
            # NOTE(review): find_elements_by_xpath was removed in Selenium 4;
            # migrate to driver.find_elements(By.XPATH, ...) when upgrading.
            title = driver.find_elements_by_xpath('//ul[@class="live-list clearfix"]/li/a[2]')
            # Default to empty lists: the original left num/name unbound on
            # failure, so the print/zip below raised NameError.
            num = []
            name = []
            try:
                num = driver.find_elements_by_xpath('//ul/li/span/span[3]/i[2]')
                name = driver.find_elements_by_xpath('//ul/li/span/span[1]/i')
            except Exception:
                print("Error!!!")
            print(len(name))
            for ti, nu, na in zip(title, num, name):
                try:
                    item = {"直播名": ti.text, "播放量": nu.text, "主播名": na.text}
                    # Parameterized query instead of the original string
                    # concatenation (SQL-injection-prone) and my.query();
                    # commit so the rows actually persist.
                    with my.cursor() as cur:
                        cur.execute(
                            "insert into zhibo(broadcast,counts,anchor) values(%s,%s,%s)",
                            (item["直播名"], item["播放量"], item["主播名"]),
                        )
                    my.commit()
                    print("写入成功!!!")
                except Exception:
                    print("Encoding error!!!")
            if html.find('class="laypage_next"') != -1:
                driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div/div/div[4]/div[1]/div/a[8]').click()
            else:
                break

    def main(self):
        """Entry point: connect to MySQL, then crawl."""
        my = self.my_sql()
        self.Firefox(my)
if __name__ == '__main__':
    # Kick off the Huya crawl (opens Firefox and a MySQL connection).
    HuYa().main()
# 案例四:爬取小说 (Case 4: novel-chapter scraping)
import requests
import asyncio
import aiohttp
async def download(c_id):
    """Fetch one chapter's content endpoint by chapter id and print the
    raw response body."""
    chapter_url = ('https://boxnovel.baidu.com/boxnovel/content?gid=4306063500'
                   f'&data=%7B"fromaction"%3A"dushu"%7D&cid={c_id}')
    async with aiohttp.ClientSession() as session:
        async with session.get(chapter_url) as resp:
            body = await resp.text()
            print(body)
async def get_Search(url):
    """Fetch the chapter list for a book and download every chapter
    concurrently.

    url: the chapterList API URL for one book id.
    """
    # NOTE(review): requests.get blocks the event loop; harmless here
    # because nothing else is scheduled yet, but aiohttp would be more
    # consistent with the rest of this coroutine.
    res = requests.get(url)
    json_str = res.json()
    # asyncio.wait() rejects bare coroutines since Python 3.11; wrap each
    # download in a Task (the original passed coroutines directly).
    tasks = [
        asyncio.create_task(download(item['chapter_id']))
        for item in json_str['data']['chapter']['chapterInfo']
    ]
    await asyncio.wait(tasks)
if __name__ == '__main__':
    # Book id of the novel to fetch; builds the chapter-list API URL.
    book_id = 4306063500
    listing_url = ('https://boxnovel.baidu.com/boxnovel/wiseapi/chapterList'
                   f'?bookid={book_id}&pageNum=1&order=asc&site=')
    asyncio.run(get_Search(listing_url))
# 案例五:多任务异步爬取数据 (Case 5: multi-task async fetching)
import time
import aiohttp
import asyncio
from lxml import etree
# Pages fetched concurrently by the demo at the bottom of this section.
urls = [
'https://www.baidu.com/',
'https://www.hao123.com/?src=from_pc'
]
# Mobile Chrome User-Agent header.  NOTE(review): defined here but
# get_request() below never passes it to session.get — confirm intent.
headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Mobile Safari/537.36'
}
async def get_request(url):
    """GET *url* with aiohttp and return the response body as text."""
    async with aiohttp.ClientSession() as session:
        # session.get(...) is itself an async context manager; the extra
        # `await` the original used is not needed.
        async with session.get(url=url) as response:
            return await response.text()
def parse(t):
    """Done-callback for a fetch task: take the finished task's page text,
    parse it with lxml and print the #aging-total-page text nodes."""
    page_source = t.result()
    tree = etree.HTML(page_source)
    print(tree.xpath('//*[@id="aging-total-page"]/text()'))
if __name__ == '__main__':
    start = time.time()

    async def _main():
        """Schedule every fetch as a task with the parse callback attached,
        then wait for all of them inside a running event loop."""
        tasks = []
        for url in urls:
            task = asyncio.ensure_future(get_request(url))
            task.add_done_callback(parse)
            tasks.append(task)
        await asyncio.wait(tasks)

    # asyncio.get_event_loop() with no running loop is deprecated and no
    # longer creates a loop on Python 3.12+; asyncio.run manages the loop
    # lifecycle instead (the original used get_event_loop/run_until_complete).
    asyncio.run(_main())
    print(time.time() - start)