最近一直在爬取晋江的小说列表(比如作者、字数等信息),但抓到的内容总是出现乱码。在网上查了很多资料之后,整理出了下面这个解决方法,应该可以适用于很多网站。
import urllib
from bs4 import BeautifulSoup
from io import BytesIO
import urllib.request
from urllib.request import urlopen
import zlib
import gzip
import re
from gzip import GzipFile
def loadData(url):
    """Fetch *url* and return the response body as decompressed bytes.

    The request advertises gzip and deflate support. The body is then
    decompressed according to the ``Content-Encoding`` response header;
    a body with a missing or unrecognised encoding is returned unchanged.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as bytes, decompressed but not yet decoded to text.
    """
    request = urllib.request.Request(url)
    request.add_header('Accept-encoding', 'gzip,deflate')
    # The context manager closes the connection even if read() raises.
    with urlopen(request) as response:
        content = response.read()
        encoding = response.info().get('Content-Encoding')
    # The header may be absent (None) or differ in case / padding.
    encoding = (encoding or '').strip().lower()
    if encoding == 'gzip':
        content = gzip1(content)
    elif encoding == 'deflate':
        content = deflate(content)
    elif encoding == 'br':
        # Assign the decoded bytes; a bare `==` comparison here would
        # silently discard them and return the compressed payload.
        content = brotli.decompress(content)
    return content
def gzip1(data):
    """Decompress a gzip-encoded byte string.

    Args:
        data: Bytes in gzip format.

    Returns:
        The decompressed bytes.
    """
    # Wrap the bytes in a file-like object so GzipFile can read them;
    # the context manager closes the GzipFile once the read is done.
    with gzip.GzipFile(fileobj=BytesIO(data)) as stream:
        return stream.read()
def deflate(data):
    """Decompress a deflate-encoded byte string.

    The data is first treated as a raw deflate stream (no zlib header).
    If that fails, it is decompressed again as a zlib-wrapped stream.

    Args:
        data: Compressed bytes, either raw deflate or zlib-wrapped.

    Returns:
        The decompressed bytes.

    Raises:
        zlib.error: If the data is valid in neither format.
    """
    try:
        # A negative wbits value tells zlib to expect no header or trailer.
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)
import brotli
def br(data):
    """Decompress a brotli-encoded byte string.

    Args:
        data: Bytes in brotli format.

    Returns:
        The decompressed bytes.
    """
    # Return the result; assigning it to a local would make the
    # function return None.
    return brotli.decompress(data)
def main():
    """Download the jjwxc.net top-ten listing page and print its HTML."""
    url = "https://www.jjwxc.net/topten.php?orderstr=7&t=0"
    content = loadData(url)
    # The body is bytes; decode it as GB18030 before printing.
    # NOTE(review): presumably the site serves a GBK-family encoding - confirm.
    html = content.decode("gb18030")
    print(html)
if __name__ == '__main__':
    # Run the download only when executed as a script, not on import.
    main()