python—简单数据抓取十(抓取网易云音乐信息、破解起点中文网的字体加密)

学习目标:

python学习三十—简单数据抓取十


学习内容:

1、抓取网易云音乐信息
2、破解起点中文网的字体加密


1、抓取网易云音乐信息

# -*- coding: utf-8 -*-
from Crypto.Cipher import AES
import base64
import requests
import json

# HTTP request headers passed to requests.post() in get_json().
# NOTE(review): the Cookie value is hard-coded and appears to carry session and
# account identifiers -- presumably captured from a browser session and likely
# expired; confirm and replace before reuse.
headers = {
'Host': 'music.163.com',
'Origin': 'http://music.163.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Cookie': '_ntes_nuid=f06d3ad37351b5873976195a9494a02d; _ntes_nnid=f102fe8ae1e8dd9ed03607cdefb1c942,1497115499078; usertrack=ZUcIhlmSfomGqgIViqQQAg==; P_INFO="wg03156419@126.com|1502852871|0|unireg|00&99|null&null&null#bej&null#10#0#0|&0||wg03156419@126.com"; _ga=GA1.2.478581340.1502773419; __gads=ID=02cb5469ddd177c4:T=1506065729:S=ALNI_MZ9QlRybfcDfjZVRKwT8W_Bv1CftQ; vjuids=6c007fd66.15ea88cc3cc.0.8971b60c9fb4f; vjlast=1506066351.1506583363.13; vinfo_n_f_l_n3=a2092b13a348e589.1.1.1506066351102.1506066377628.1506583384074; _iuqxldmzr_=32; __utmc=94650624; __utmz=94650624.1521856821.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; JSESSIONID-WYYY=u9SvNoV%2FXRomtTlZNB0ZIIe0VwF%2BD40Sx1NZU%5CGoiobfm6b%2FzCATtdRCD29qziIez%2FpTGGY%2Fa6ZegzTxo3app6l%2BiA6diHt%2FScw9vYSWb5VaA1YJW6FBujX3orZC6lgKZK9o21KKTrqfidGtXkHH9Mvav2U8EUfFHY%2F%2Bsw7wJ8OMcNCq%3A1522142142350; __utma=94650624.478581340.1502773419.1522129721.1522141162.7; __utmb=94650624.2.10.1522141162; playerid=67808580',
    'Referer': 'http://music.163.com/'
}

# Plaintext request payload; get_params() encrypts this string twice with AES.
# It names the song id(s), a "br" value and an empty csrf_token.
first_param ="{\"ids\":\"[1485808754]\",\"br\":'128000',\"csrf_token\":\"\"}"
# second_param and third_param are not referenced by the code shown here.
# NOTE(review): presumably the RSA public exponent and modulus that produced
# the fixed encSecKey returned by get_encSecKey() -- TODO confirm.
second_param = "010001"
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# 16-byte key used by get_params() for the first AES pass.
forth_param = b"0CoJUm6Qyw8W8jud"



def get_params():
    """Build the ``params`` form field by AES-encrypting ``first_param`` twice.

    The first pass uses ``forth_param`` as the key; its Base64 output is then
    encrypted again with a key of sixteen ``F`` bytes. Both passes share the
    same fixed IV.

    Returns:
        bytes: The Base64 output of the second ``AES_encrypt`` pass.
    """
    cbc_iv = b"0102030405060708"
    inner_key, outer_key = forth_param, b'F' * 16
    inner_cipher = AES_encrypt(first_param, inner_key, cbc_iv)
    # The second pass takes the Base64 text of the first pass as its plaintext.
    return AES_encrypt(inner_cipher.decode('utf8'), outer_key, cbc_iv)

def get_encSecKey():
    """Return the fixed ``encSecKey`` form value as a hex string.

    The value is a constant; nothing is computed at call time.
    """
    return "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"


def AES_encrypt(text, key, iv):
    """Encrypt ``text`` with AES in CBC mode and return it Base64-encoded.

    Args:
        text: Plaintext. A ``str`` is encoded as UTF-8; ``bytes`` are used
            as given.
        key: AES key as bytes (16 bytes at every call site in this file).
        iv: CBC initialisation vector as bytes (16 bytes in this file).

    Returns:
        bytes: Base64 encoding of the ciphertext.
    """
    raw = text.encode('utf8') if isinstance(text, str) else bytes(text)
    # Pad the *encoded* bytes, not the character count: a non-ASCII character
    # occupies several bytes, so padding computed on len(str) would leave the
    # data misaligned with the 16-byte AES block size.
    pad = 16 - len(raw) % 16
    raw += bytes([pad]) * pad
    encryptor = AES.new(key, AES.MODE_CBC, iv)
    return base64.b64encode(encryptor.encrypt(raw))


def get_json(url, params, encSecKey):
    """POST the encrypted form fields to ``url`` and return the raw body.

    Args:
        url: Endpoint to post to.
        params: Encrypted ``params`` value as bytes; decoded as UTF-8 for
            the form.
        encSecKey: Value sent as the ``encSecKey`` form field.

    Returns:
        bytes: ``response.content`` of the POST, sent with the module-level
        ``headers``.
    """
    params_text = params.decode('utf8')
    form = {"params": params_text, "encSecKey": encSecKey}
    # Debug trace of what is about to be sent.
    print(1, params_text)
    print(2, encSecKey)

    print(form)
    return requests.post(url, headers=headers, data=form).content


if __name__ == "__main__":
    # Request the player URL for the song id embedded in first_param.
    url = 'http://music.163.com/weapi/song/enhance/player/url?csrf_token='
    params = get_params()
    encSecKey = get_encSecKey()
    json_text = get_json(url, params, encSecKey)

    # JSON exchanged over HTTP is UTF-8 (RFC 8259); decoding it as gb18030
    # garbles or rejects any non-ASCII text in the body. Decode and parse
    # once and reuse the result.
    body_text = json_text.decode('utf-8')
    print(body_text)
    result = json.loads(body_text)
    song_url = result['data'][0]['url']
    print(song_url)

    # Headers for fetching the audio file itself (only needed by the optional
    # download step below).
    header = {
        'Host': 'm10.music.126.net',
        'Referer': 'http://music.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    print(result)
    # Optional: download the track to disk.
    # source = requests.get(song_url, headers=header).content
    # with open('hello.mp3', 'wb') as mp3:
    #     mp3.write(source)

2、破解起点中文网的字体加密

import re
import requests
# 用到的工具包
from fontTools.ttLib import TTFont

url = 'https://book.qidian.com/info/1026302634'
source = requests.get(url).text
print(source)

# The page hides its numbers behind a custom web font. Extract the link to
# the .woff file from the inline font rule in the page source.
font_urls = re.findall(r"format\('eot'\); src: url\('(.*?)'\) format\('woff'\)", source)
if not font_urls:
    # Fail with a clear message instead of a bare IndexError on [0].
    raise RuntimeError('no woff font URL found in page source')
font_url = font_urls[0]
print(font_url)

# Save the font locally, in binary mode, under its own file name.
filename = font_url.split('/')[-1]
font_content = requests.get(font_url).content
with open(filename, 'wb') as f:
    f.write(font_content)

# Load the font into a Python object and also dump it as XML for inspection.
base_font = TTFont(filename)
base_font.saveXML('font.xml')

# Hand-written table from glyph names to the characters they stand for.
eng_2_num = {
    'period': ".", 'two': '2', 'zero': '0', 'five': '5', 'nine': "9", 'seven': '7', 'one': '1', 'three': '3',
    'six': '6', 'four': '4', 'eight': '8'
}

# The font's cmap maps code points to glyph names; translate the glyph names
# through eng_2_num to get code point -> character. A new dict is built so the
# mapping returned by the font object is not overwritten in place.
map_list = base_font.getBestCmap()
print('映射前:', map_list)
map_list = {code: eng_2_num[glyph] for code, glyph in map_list.items()}
print('映射后:', map_list)

# Pull the obfuscated numbers (runs of "&#NNNNNN;" entities) out of the page
# and decode each one with map_list.
pattern = re.compile('</style><span.*?>(.*?)</span>', re.S)
print("pattern:", pattern)
data = pattern.findall(source)
print('data is :', data)
for entities in data:
    # Splitting on ';' leaves an empty trailing piece; drop empty pieces
    # rather than blindly popping the last element.
    strs = [part for part in entities.split(';') if part]
    print('strs is :', strs)
    digits = []
    for num in strs:
        value = num[2:]  # strip the leading "&#"
        print('value is :', value)
        digits.append(map_list[int(value)])
    word_count = ''.join(digits)
    print('wold_count is :', word_count)

输出结果:

......
https://qidian.gtimg.com/qd_anti_spider/BDXwYVSF.woff
映射前: {100183: 'five', 100185: 'eight', 100186: 'zero', 100187: 'four', 100188: 'two', 100189: 'six', 100190: 'seven', 100191: 'period', 100192: 'three', 100193: 'one', 100194: 'nine'}
映射后: {100183: '5', 100185: '8', 100186: '0', 100187: '4', 100188: '2', 100189: '6', 100190: '7', 100191: '.', 100192: '3', 100193: '1', 100194: '9'}
pattern: re.compile('</style><span.*?>(.*?)</span>', re.DOTALL)
data is : ['&#100188;&#100186;&#100191;&#100194;&#100192;', '&#100186;', '&#100193;&#100186;&#100187;&#100188;', '&#100183;&#100194;']
strs is : ['&#100188', '&#100186', '&#100191', '&#100194', '&#100192']
value is : 100188
value is : 100186
value is : 100191
value is : 100194
value is : 100192
wold_count is : 20.93
strs is : ['&#100186']
value is : 100186
wold_count is : 0
strs is : ['&#100193', '&#100186', '&#100187', '&#100188']
value is : 100193
value is : 100186
value is : 100187
value is : 100188
wold_count is : 1042
strs is : ['&#100183', '&#100194']
value is : 100183
value is : 100194
wold_count is : 59
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值