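# Web page (HTTP) compression mainly comes in two forms: gzip and deflate.
# DEFLATE is a patent-free lossless data-compression algorithm with many
# open-source implementations.
# gzip is a separate compression format that wraps DEFLATE-compressed data
# in its own header and checksum trailer.
# gzip is supported almost everywhere today; deflate is an outdated choice
# for web page compression.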
if __name__ == "__main__":
    # Demo: fetch a page, undo any HTTP content-coding (gzip / deflate),
    # detect the character set of the raw bytes with chardet, and print the
    # decoded text.

    # Imported locally so this demo does not depend on import statements
    # that only appear further down the file; re-importing is harmless if
    # the modules are already loaded.
    import urllib.request
    import zlib

    import chardet  # third-party charset detector this script already uses

    url = 'http://www.qq.com/'
    req = urllib.request.Request(url)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(req, timeout=120) as response:
        html = response.read()
        encoding = response.info().get('Content-Encoding')
    print(encoding)

    # Content-coding tokens are case-insensitive and the header may be
    # absent, so normalise before comparing.
    coding = (encoding or '').strip().lower()
    if coding in ('gzip', 'x-gzip'):
        # wbits = 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    elif coding == 'deflate':
        try:
            # Negative wbits: raw DEFLATE stream without a zlib header.
            html = zlib.decompress(html, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a zlib-wrapped stream, which some servers send
            # under the "deflate" label.
            html = zlib.decompress(html)

    # chardet reports None when it cannot guess (for example on an empty
    # body); bytes.decode(None, ...) raises TypeError, so fall back to UTF-8.
    charset = chardet.detect(html)["encoding"] or 'utf-8'
    print(charset)
    print(html.decode(charset, 'ignore'))
import urllib.request
import zlib
# POST a form-encoded payload to the article-content endpoint with a fixed
# set of client headers, then print the response body decoded as UTF-8.
loginUrl = 'https://api.nfapp.southcn.com/nanfang_if/getArticleContent?articleId=2055802&colID=1207&location=%E5%B9%BF%E5%B7%9E'

# Built before the headers so Content-Length can be derived from the actual
# payload instead of being a hand-maintained constant.
loginData = 'id=2055802&userID=0&siteID=1&userOtherID=ffffffff-d83e-9eed-ffff-ffffef05ac4a&eventType=0&type=0&'.encode('UTF-8')

# NOTE(review): most custom header values look Base64-encoded; that is
# inferred from their shape only - confirm against the API's expectations.
headers = {
    'Accept-Encoding': 'gzip',
    'User-Agent': 'okhttp/3.11.0',
    # Evaluates to '97' for the payload above and stays correct if the
    # payload is edited.
    'Content-Length': str(len(loginData)),
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'androidId': 'NzgwZTJjNTAyM2MxMzViNQ==',
    'bluetooth': '',
    'brand': 'dGVuY2VudA==',
    'channel': 'eWluZ3lvbmdiYW8=',
    'deviceId': 'ZmZmZmZmZmYtZDgzZS05ZWVkLWZmZmYtZmZmZmVmMDVhYzRh',
    'imei': 'NjY2NjYwMDgyMzMzOTg2',
    'latitude': 'MC4w',
    'longitude': 'MC4w',
    'mac': 'MDg6MDA6Mjc6QzQ6NDY6QzA=',
    'manufacturer': 'VGVuY2VudA==',
    'model': 'dmlydHVhbG1hY2hpbmUy',
    'networkType': 'V2lGaQ==',
    'operator': '',
    'os': 'YW5kcm9pZA==',
    'osVersion': 'MTk=',
    'screen': 'NzIweDEyODA=',
    'version': 'NS4yLjU=',
    'versionCode': 'NTI1MA==',
    'Connection': 'close',
    'Host': 'api.nfapp.southcn.com',
}

request = urllib.request.Request(loginUrl, loginData, headers)
# A timeout keeps the script from hanging forever on an unresponsive server;
# the context manager closes the connection even if read() raises.
with urllib.request.urlopen(request, timeout=120) as res:
    raw_body = res.read()
    content_coding = (res.info().get('Content-Encoding') or '').strip().lower()

# Only gunzip when the server says the body is gzip-coded; an uncompressed
# reply would otherwise make zlib.decompress raise zlib.error.
if content_coding in ('gzip', 'x-gzip'):
    # wbits = 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    html = zlib.decompress(raw_body, 16 + zlib.MAX_WBITS)
else:
    html = raw_body

data = html.decode('UTF-8', 'ignore')
print(data)