学习目标:
python学习三十—简单数据抓取十
学习内容:
1、抓取网易云音乐信息
2、破解起点中文网的字体加密
1、抓取网易云音乐信息
# coding=utf-8
from Crypto.Cipher import AES
import base64
import requests
import json
# HTTP headers sent with the weapi POST request.
# NOTE(review): the Cookie below is a captured browser session (it embeds an
# e-mail address and a JSESSIONID); it looks like it should be replaced with
# the reader's own cookie rather than reused as-is -- confirm before sharing.
headers = {
    'Host': 'music.163.com',
    'Origin': 'http://music.163.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Cookie': '_ntes_nuid=f06d3ad37351b5873976195a9494a02d; _ntes_nnid=f102fe8ae1e8dd9ed03607cdefb1c942,1497115499078; usertrack=ZUcIhlmSfomGqgIViqQQAg==; P_INFO="wg03156419@126.com|1502852871|0|unireg|00&99|null&null&null#bej&null#10#0#0|&0||wg03156419@126.com"; _ga=GA1.2.478581340.1502773419; __gads=ID=02cb5469ddd177c4:T=1506065729:S=ALNI_MZ9QlRybfcDfjZVRKwT8W_Bv1CftQ; vjuids=6c007fd66.15ea88cc3cc.0.8971b60c9fb4f; vjlast=1506066351.1506583363.13; vinfo_n_f_l_n3=a2092b13a348e589.1.1.1506066351102.1506066377628.1506583384074; _iuqxldmzr_=32; __utmc=94650624; __utmz=94650624.1521856821.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; JSESSIONID-WYYY=u9SvNoV%2FXRomtTlZNB0ZIIe0VwF%2BD40Sx1NZU%5CGoiobfm6b%2FzCATtdRCD29qziIez%2FpTGGY%2Fa6ZegzTxo3app6l%2BiA6diHt%2FScw9vYSWb5VaA1YJW6FBujX3orZC6lgKZK9o21KKTrqfidGtXkHH9Mvav2U8EUfFHY%2F%2Bsw7wJ8OMcNCq%3A1522142142350; __utma=94650624.478581340.1502773419.1522129721.1522141162.7; __utmb=94650624.2.10.1522141162; playerid=67808580',
    'Referer': 'http://music.163.com/',
}

# Plain-text request payload that get_params() encrypts. The 'br' value is
# single-quoted in the payload itself; it is kept exactly as the original
# literal spells it.
first_param = '{"ids":"[1485808754]","br":\'128000\',"csrf_token":""}'
# Hex strings kept alongside the payload; nothing in this section reads
# second_param or third_param.
second_param = "010001"
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# 16-byte AES key used for the first encryption pass in get_params().
forth_param = b"0CoJUm6Qyw8W8jud"
def get_params():
    """Build the encrypted ``params`` form field for the weapi request.

    The module-level ``first_param`` payload is encrypted twice with
    ``AES_encrypt`` (AES-CBC, fixed IV): first under ``forth_param``, then
    the base64 text of that result is encrypted again under a fixed
    16-byte key of ``b'F'`` characters.

    Returns:
        bytes: base64-encoded ciphertext of the second pass.
    """
    iv = b"0102030405060708"
    first_key = forth_param
    # NOTE(review): presumably the constant returned by get_encSecKey() was
    # produced for exactly this second key -- confirm before changing either.
    second_key = 16 * b'F'
    h_encText = AES_encrypt(first_param, first_key, iv)
    # AES_encrypt returns base64 bytes; decode so the second pass receives text.
    h_encText = AES_encrypt(h_encText.decode('utf8'), second_key, iv)
    return h_encText
def get_encSecKey():
    """Return the fixed ``encSecKey`` form field sent with every request.

    The value is a hard-coded hex string; nothing is computed here.

    Returns:
        str: the constant hex string.
    """
    encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
    return encSecKey
def AES_encrypt(text, key, iv):
    """Encrypt ``text`` with AES-CBC and return the result base64-encoded.

    The input is padded up to a multiple of 16 bytes with ``pad`` bytes of
    value ``pad`` (1..16). The padding length is computed on the encoded
    bytes, not on the character count, so non-ASCII text also ends up
    block-aligned; for pure ASCII input the output is unchanged.

    Args:
        text: plain text as ``str`` (encoded as UTF-8) or raw ``bytes``.
        key: AES key as bytes.
        iv: 16-byte initialisation vector as bytes.

    Returns:
        bytes: base64 encoding of the ciphertext.
    """
    data = text.encode('utf8') if isinstance(text, str) else bytes(text)
    pad = 16 - len(data) % 16
    data += bytes([pad]) * pad
    encryptor = AES.new(key, AES.MODE_CBC, iv)
    encrypt_text = encryptor.encrypt(data)
    return base64.b64encode(encrypt_text)
def get_json(url, params, encSecKey, timeout=10):
    """POST the encrypted form to ``url`` and return the raw response body.

    Args:
        url: endpoint to post to.
        params: encrypted payload as bytes (as returned by ``get_params``);
            it is decoded as UTF-8 for the form field.
        encSecKey: value for the ``encSecKey`` form field.
        timeout: seconds to wait for the server before ``requests`` raises;
            without it a stalled connection would block forever.

    Returns:
        bytes: the undecoded response body.
    """
    data = {
        "params": params.decode('utf8'),
        "encSecKey": encSecKey,
    }
    response = requests.post(url, headers=headers, data=data, timeout=timeout)
    return response.content
if __name__ == "__main__":
    # Request the player URL for the song id embedded in first_param and
    # print both the raw response and the extracted URL.
    url = 'http://music.163.com/weapi/song/enhance/player/url?csrf_token='
    params = get_params()
    encSecKey = get_encSecKey()
    json_text = get_json(url, params, encSecKey)
    # JSON exchanged over HTTP is UTF-8 by specification, so decode it as
    # such (once) instead of guessing a GB18030 body.
    # NOTE(review): assumes the server follows the spec -- confirm against
    # the response's Content-Type if non-ASCII fields look wrong.
    response_text = json_text.decode('utf-8')
    result = json.loads(response_text)
    print(response_text)
    print(result['data'][0]['url'])
    # Headers for fetching the audio file itself; only used by the optional
    # download step below.
    header = {
        'Host': 'm10.music.126.net',
        'Referer': 'http://music.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    print(result)
    # Optional: download the track to disk.
    # source = requests.get(result['data'][0]['url'], headers=header).content
    # with open('hello.mp3', 'wb') as mp3:
    #     mp3.write(source)
2、破解起点中文网的字体加密
import re
import requests
# 用到的工具包
from fontTools.ttLib import TTFont
# Fetch a Qidian book page and decode the numbers it hides behind a custom
# web font: the page emits numeric character references whose code points
# only map to digits through the font's cmap table.
url = 'https://book.qidian.com/info/1026302634'
source = requests.get(url, timeout=10).text
print(source)

# The obfuscation font is shipped as a woff file; pull its URL out of the
# inline @font-face rule.
font_matches = re.findall(r"format\('eot'\); src: url\('(.*?)'\) format\('woff'\)", source)
if not font_matches:
    # Fail with a readable message instead of a bare IndexError.
    raise RuntimeError('woff font url not found in page source')
font_url = font_matches[0]
print(font_url)

# Save the font locally (binary mode) under its original file name.
filename = font_url.split('/')[-1]
font_content = requests.get(font_url, timeout=10).content
with open(filename, 'wb') as f:
    f.write(font_content)

# Load the font and also dump it as XML so the cmap table can be inspected.
base_font = TTFont(filename)
base_font.saveXML('font.xml')

# Hand-written table from glyph names to the characters they draw.
eng_2_num = {
    'period': ".", 'two': '2', 'zero': '0', 'five': '5', 'nine': "9", 'seven': '7', 'one': '1', 'three': '3',
    'six': '6', 'four': '4', 'eight': '8'
}

# cmap gives code point -> glyph name; combine it with the table above to get
# code point -> digit. A new dict is built so the mapping object returned by
# fontTools is not overwritten in place.
map_list = base_font.getBestCmap()
print('映射前:', map_list)
map_list = {code: eng_2_num[glyph_name] for code, glyph_name in map_list.items()}
print('映射后:', map_list)

# Collect the obfuscated number spans from the page and decode each one with
# the code point -> digit mapping.
pattern = re.compile('</style><span.*?>(.*?)</span>', re.S)
print("pattern:", pattern)
data = pattern.findall(source)
print('data is :', data)
for encoded in data:
    # Each span is a run of "&#NNNNNN;" references; splitting on ';' leaves a
    # trailing empty piece, which is dropped.
    strs = encoded.split(';')[:-1]
    print('strs is :', strs)
    digits = []
    for num in strs:
        value = num[2:]  # strip the leading "&#"
        print('value is :', value)
        digits.append(map_list[int(value)])
    word_count = ''.join(digits)
    print('wold_count is :', word_count)
输出结果:
......
https://qidian.gtimg.com/qd_anti_spider/BDXwYVSF.woff
映射前: {100183: 'five', 100185: 'eight', 100186: 'zero', 100187: 'four', 100188: 'two', 100189: 'six', 100190: 'seven', 100191: 'period', 100192: 'three', 100193: 'one', 100194: 'nine'}
映射后: {100183: '5', 100185: '8', 100186: '0', 100187: '4', 100188: '2', 100189: '6', 100190: '7', 100191: '.', 100192: '3', 100193: '1', 100194: '9'}
pattern: re.compile('</style><span.*?>(.*?)</span>', re.DOTALL)
data is : ['𘝜𘝚𘝟𘝢𘝠', '𘝚', '𘝡𘝚𘝛𘝜', '𘝗𘝢']
strs is : ['𘝜', '𘝚', '𘝟', '𘝢', '𘝠']
value is : 100188
value is : 100186
value is : 100191
value is : 100194
value is : 100192
wold_count is : 20.93
strs is : ['𘝚']
value is : 100186
wold_count is : 0
strs is : ['𘝡', '𘝚', '𘝛', '𘝜']
value is : 100193
value is : 100186
value is : 100187
value is : 100188
wold_count is : 1042
strs is : ['𘝗', '𘝢']
value is : 100183
value is : 100194
wold_count is : 59