python 网络小说爬取3

最新推荐文章于 2022-04-19 19:35:17 发布

Jxufe渣渣斯

最新推荐文章于 2022-04-19 19:35:17 发布

阅读量1.5k

点赞数

本文链接：https://blog.csdn.net/JxufeCarol/article/details/104395628

版权

/*
网络小说：[secret]
*/
#coding:utf-8
import re
import sys
from bs4 import BeautifulSoup
import urllib.request
import time
import random
# Pool of candidate proxies; each entry maps a URL scheme to a "host:port" proxy.
proxy_list = [
  {"http":"124.88.67.54:80"},
  {"http":"61.135.217.7:80"},
  {"http":"120.230.63.176:80"},
  {"http":"210.35.205.176:80"}
]
# NOTE(review): every entry only defines the "http" scheme, while the novel
# index fetched by this script is an https:// URL -- confirm whether those
# requests are really expected to go through the chosen proxy.
proxy = random.choice(proxy_list)  # pick one proxy address at random
httpproxy_handler = urllib.request.ProxyHandler(proxy)
opener = urllib.request.build_opener(httpproxy_handler)
# Make this opener the default used by every later urllib.request.urlopen() call.
urllib.request.install_opener(opener)

#定义一个爬取网络小说的函数
def getNovelContent():
    """Download chapters of the novel and append them to ``124.txt``.

    The table of contents is fetched and parsed, every chapter link is
    extracted with a regular expression, the entries before
    ``start_chapter_num`` are skipped, and each remaining chapter page is
    downloaded, stripped of markup and of the site's watermark sentence, and
    appended (title first) to ``124.txt`` in UTF-8.

    Takes no arguments and returns ``None``.  Network and decoding errors
    raised by ``urllib`` / ``bytes.decode`` are not caught and propagate to
    the caller.
    """
    book_url = 'https://www.luoqiuzw.com/book/94819/'
    # The context manager guarantees the HTTP response is closed after parsing.
    with urllib.request.urlopen(book_url, timeout=40) as index_page:
        data = BeautifulSoup(index_page, 'html.parser')

    # Each table-of-contents entry yields a (relative link, chapter title) pair.
    link_reg = re.compile(r'<dd><a href="/book/94819/(.*?)">(.*?)</a></dd>')
    urls = link_reg.findall(str(data))

    # 1-based position in the link list of the first chapter to download.
    # Original author's note: "last chapter number plus 7, then simply re-run
    # (13 corresponds to the first chapter)".
    start_chapter_num = 7 + 763

    # Compiled once, outside the loop; re.S lets '.' span the line breaks
    # inside the chapter body.
    chapter_reg = re.compile(
        r'<div id="content" deep="3"><p>(.*?)</p><br></div><div class="bottem2">',
        re.S)

    # Clean-up substitutions, applied in this exact order.
    replacements = (
        ("&nbsp;", ""),
        ("天才一秒记住本站地址：[落秋中文] https://www.luoqiuzw.com/最快更新！无广告！", ""),
        ("<br>", "\n"),
        ("content_detail", ""),
        ("<p>", ""),
        ("</p>", ""),
        ("\r\n\t\t", ""),
    )

    # Slicing replaces the manual counter: entries before position
    # start_chapter_num (1-based) are skipped.
    for chapter_path, chapter_title in urls[start_chapter_num - 1:]:
        chapter_url = "https://www.luoqiuzw.com/book/94819/" + chapter_path
        with urllib.request.urlopen(chapter_url, timeout=40) as chapter_page:
            chapter_html = chapter_page.read().decode("utf-8")

        for content in chapter_reg.findall(chapter_html):
            for old, new in replacements:
                content = content.replace(old, new)
            # Append mode: chapters accumulate in one file across runs.
            with open('124.txt', 'a', encoding='utf-8') as f:
                f.write("\n" + chapter_title + "\n\n")
                f.write(content + "\n")
        print(chapter_url)  # progress marker: this chapter has been processed


if __name__ == '__main__':
    getNovelContent()

/*
网络小说：[secret]
*/
#coding:utf-8
import re
import sys
from bs4 import BeautifulSoup
import urllib.request
from io import BytesIO  #for gzip decode
import gzip  #for gzip decode
import time
import random
# Default header sent with every request: pose as mobile Safari on an iPhone.
headers = ('User-Agent', 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1')
opener = urllib.request.build_opener()
# addheaders is a list of (name, value) tuples; the previous set literal only
# worked by accident of iteration and would break on append/index access.
opener.addheaders = [headers]
# Make this opener the default used by every later urllib.request.urlopen() call.
urllib.request.install_opener(opener)

#定义一个爬取网络小说的函数
def _decode_page(raw):
    """Return the response bytes *raw* as text, gunzipping them first if needed."""
    # A gzip stream always starts with the magic bytes 1f 8b; anything else is
    # treated as an uncompressed UTF-8 page.
    if raw[:2] == b'\x1f\x8b':
        raw = gzip.decompress(raw)
    return raw.decode('utf-8')


def getNovelContent():
    """Download chapters of the novel and append them to ``126.txt``.

    The table of contents is fetched (gzip-compressed or plain), every chapter
    link is extracted with a regular expression, the entries before
    ``start_chapter_num`` are skipped, and each remaining chapter page is
    downloaded, cleaned of markup and of the site's watermark sentence, and
    appended (title first) to ``126.txt`` in UTF-8.

    Takes no arguments and returns ``None``.  Network and decoding errors are
    not caught and propagate to the caller.
    """
    with urllib.request.urlopen('http://www.vipzw.com/90_90334/', timeout=60) as index_page:
        # The index page used to be gunzipped unconditionally, which raised
        # OSError whenever the server answered with an uncompressed body.
        html = _decode_page(index_page.read())
    data = BeautifulSoup(html, "html.parser")

    # Each table-of-contents entry yields a (relative link, chapter title) pair.
    link_reg = re.compile(r'<dd><a href="/90_90334/(.*?)">(.*?)</a></dd>')
    urls = link_reg.findall(str(data))

    # 1-based position in the link list of the first chapter to download.
    # Original author's note: "last chapter number plus 9, then simply re-run
    # (13 corresponds to the first chapter)".
    start_chapter_num = 9 + 20

    # Compiled once, outside the loop; re.S lets '.' span line breaks.
    chapter_reg = re.compile(r'<div id="content">(.*?)</div>', re.S)

    # Clean-up substitutions, applied in this exact order.
    replacements = (
        ("&nbsp;&nbsp;&nbsp;&nbsp;", "    "),
        ("<br />", ""),
        ("请记住本书首发域名：www.vipzw.com。VIP中文_笔趣阁手机版阅读网址：m.vipzw.com", ""),
        ("\u3000\u3000", ""),
    )

    for chapter_path, chapter_title in urls[start_chapter_num - 1:]:
        chapter_url = "http://www.vipzw.com/90_90334/" + chapter_path
        with urllib.request.urlopen(chapter_url, timeout=60) as chapter_page:
            chapter_html = _decode_page(chapter_page.read())

        for content in chapter_reg.findall(chapter_html):
            for old, new in replacements:
                content = content.replace(old, new)
            # Append mode: chapters accumulate in one file across runs.
            with open('126.txt', 'a', encoding='utf-8') as f:
                f.write("\n" + chapter_title + "\n\n")
                f.write(content + "\n")
        print(chapter_url)  # progress marker: this chapter has been processed


if __name__ == '__main__':
    getNovelContent()

以上网络小说爬虫均是帮别人爬取的，在写代码的过程中发现以下问题：
一、代理IP（随机IP爬取内容）
二、gzip抓取网页
三、utf-8编码

一、代理IP问题（随机IP爬取内容）
参考链接：python爬虫-实现多个ip地址访问https://blog.csdn.net/qq_43709494/article/details/93937821
整体思路：我们可以建立一个存放ip地址的列表，包含多个可用的ip地址（大家可以网上搜免费ip地址），然后每次都随机调用一个ip地址，建立http连接。这样就避免了同一个ip地址多次访问被屏蔽的风险。

from urllib import request
import random
# Pool of candidate proxies; each entry maps a URL scheme to a "host:port" proxy.
proxy_list = [
  {"http":"124.88.67.54:80"},
  {"http":"61.135.217.7:80"},
  {"http":"42.231.165.132:8118"}
]
proxy = random.choice(proxy_list)  # pick one proxy address at random
httpproxy_handler = request.ProxyHandler(proxy)
opener = request.build_opener(httpproxy_handler)
# Use a separate name for the Request object: rebinding `request` would hide
# the urllib.request module for any code that follows.
req = request.Request("http://www.baidu.com/")
# The context manager closes the HTTP response once the body has been read.
with opener.open(req) as response:
    print(response.read())

二、gzip抓取网页

WARNING:root:Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.

参考链接：python读取gzip格式及普通格式网页的方法https://blog.csdn.net/HelloHaibo/article/details/77624416

一般情况下，我们读取网页分析去返回内容时是这样子的：

#!/usr/bin/python
#coding:utf-8
import urllib2
# Send a browser-like User-Agent with the request.
headers = {"User-Agent": 'Opera/9.25 (Windows NT 5.1; U; en)'}
# NOTE(review): urllib2 is the Python 2 module name -- this quoted sample is
# presumably meant for Python 2 only.
request = urllib2.Request(url='http://www.baidu.com', headers=headers)
# Fetch the page and read the whole response body.
response = urllib2.urlopen(request).read()

一般情况下，你可以看到返回的网页源码：

<html>
<head>
    <meta http-equiv="content-type" content="text/html;charset=utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge">
	<meta content="always" name="referrer">
    <meta name="theme-color" content="#2932e1">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" />
    <link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="百度搜索" /> 
    <link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu.svg">
	<link rel="dns-prefetch" href="//s1.bdstatic.com"/>
	<link rel="dns-prefetch" href="//t1.baidu.com"/>
	<link rel="dns-prefetch" href="//t2.baidu.com"/>
	<link rel="dns-prefetch" href="//t3.baidu.com"/>
	<link rel="dns-prefetch" href="//t10.baidu.com"/>
	<link rel="dns-prefetch" href="//t11.baidu.com"/>
	<link rel="dns-prefetch" href="//t12.baidu.com"/>
	<link rel="dns-prefetch" href="//b1.bdstatic.com"/>
    <title>百度一下，你就知道</title>   
<style id="css_index" index="index" type="text/css">html,body{height:100%}
html{overflow-y:auto}

但是有时候访问有些网页时，也会返回乱码，ok，你会首先考虑编码的问题（这不是本文的重点，一笔带过），查看网页的编码（可以参考我的文章-python获取网页编码的方法），然后用它的编码方式去decode内容，这样会解决一部分的网页乱码，但是有时候可以肯定不是编码的问题，怎么还是乱码？

当然，我们还有一种情况没有考虑，同时也是我们最容易忽略的一点，返回的网页格式。一般网页返回数据的格式会是text/html和gzip两种，text/html格式的数据是可以直接read的，而gzip格式的数据不能直接read，需要使用专门的gzip模块进行读取，废话不多说，亮代码：

from StringIO import StringIO
import gzip
import urllib2
# Send a browser-like User-Agent with the request.
headers = {"User-Agent": 'Opera/9.25 (Windows NT 5.1; U; en)'}
# The url value is a placeholder for the address of a gzip-encoded page.
request = urllib2.Request(url='gzip格式的网页', headers=headers)
response = urllib2.urlopen(request)
# Wrap the raw body in an in-memory file object so GzipFile can read from it.
buf = StringIO( response.read())
f = gzip.GzipFile(fileobj=buf)
# Reading through the GzipFile yields the decompressed page.
data = f.read()
# process `data` here (the dotted line below is the article's placeholder)
.........
f.close()

/*可能会出现的错误*/
OSError: Not a gzipped file (b'<!') 
#没有添加f.close()，或者不需要对网页进行gzip操作

注意：一定要关闭f，即必须要有f.close() ，特别是做爬虫，进行多线程抓取。
当然这还不完美，有的服务器会主动更换返回网页的格式（如果你对text/html格式数据进行gzip模块解读，给你输出的也是一堆乱码），有可能是白天text/html，晚上gzip，也有可能每天轮换（本人亲身经历，一个网站在中午12点后是text/html格式，上午还是gzip格式），所以很理所当然的我们就要想到读到网页后先进行检测内容格式（通过info()里面的Content-Encoding项），再进行相应的处理。最终代码：

#coding:utf-8
from StringIO import StringIO
import gzip
import urllib2
# Send a browser-like User-Agent with the request.
headers = {"User-Agent": 'Opera/9.25 (Windows NT 5.1; U; en)'}
# The url value is a placeholder for the target address.
request = urllib2.Request(url='网址', headers=headers)
response = urllib2.urlopen(request)
# Decide how to read the body from the Content-Encoding response header.
if response.info().get('Content-Encoding') == 'gzip':
    # gzip body: wrap the bytes in an in-memory file and decompress through GzipFile.
    buf = StringIO( response.read())
    f = gzip.GzipFile(fileobj=buf)
    data = f.read()
    # process `data` here
    f.close()
else:
   # Any other encoding: use the body as returned.
   data = response .read()

三、utf-8编码
参考链接1：Python3中使用Requests和BeaitfulSoup的编码问题https://www.jianshu.com/p/664483569101
写在前面的话：
学习Python有一段时间了，但是一直没有太多的实战，前期的学习主要是看买的电子书 Python学习手册 (额，刚刚看了一下，这本书的电子书居然已经下架，但是我基本确定我买的就是这个，电子书和实体书的特点一样：几乎都是同类书中最贵的，当时真的是买得很心痛啊！)，看了不到30%，发现这本书还是有点门槛的，于是就看了一些零基础的视频，还有比较经典的廖雪峰的Python3教程, 还有对应的视频教程哦 (重点：记得赞赏作者廖雪峰啊!)。总之一句话：找了非常多资料，看了一些书。然并卵，心里就是没底，那就实战吧！本文就是实践中遇到的第一个比较棘手的问题，以及解决方案。
问题源码：

#!/usr/bin/env python3      # 对Windows无用，可直接在UNIX内核系统运行（Mac OS, Linux...)
# -*- coding: utf-8 -*-     # 告诉调用对象，程序是用UTF-8编码的
import requests             # 导入第三方库
from bs4 import BeautifulSoup
def get_BOC_data(url):
    """Fetch the page at *url* and print the rows of its data table.

    The page is requested through a fixed HTTP proxy with browser-like
    headers, parsed with BeautifulSoup, pretty-printed in full, and then every
    cell of the first table matched by ``body div.publish table`` is printed,
    tab-separated, one table row per output line.

    Returns ``None``.  Raises ``IndexError`` when the selector matches no
    table; ``requests`` errors propagate unchanged.

    NOTE: ``response.encoding`` is not overridden here, so ``response.text``
    is decoded with whatever encoding requests inferred for the response.
    """
    # Imitate a browser's request headers.
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
               "Referer": "http://www.boc.cn/"}
    # Proxy IP; unstable, it may stop working at any time.
    proxies = {"http": "218.76.106.78:3128"}
    response = requests.get(url, headers=headers, proxies=proxies)
    # Print the status code and the requested URL.  (This line previously
    # referenced the misspelled name `rsponse` and raised NameError.)
    print(response.status_code, "\n", response.url)
    soup = BeautifulSoup(response.text, "lxml")  # parse the page
    print(soup.prettify())  # pretty-print the parsed document
    # CSS selector lookup; the first matching table holds the data.
    table_data = soup.select("body div.publish table")[0]
    for row in table_data.find_all("tr"):
        for col in row.find_all("td"):
            print(col.string, end="\t")
        print("\n")


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    start_url = "http://www.bankofchina.com/sourcedb/ffx/index.html"  # initial target URL
    get_BOC_data(start_url)

问题描述：
代码背景说明
我之前呆的公司是做金融数据的，获取数据的方式一般是：爬虫（大量规则的数据）、人工采集（很多数据采集员，采集不规则或少量或非常重要的数据）、买。我开始做的就是数据采集员，非常枯燥，但不断学习，后来就做了质检，最后离开的时候数据策划分析员。由于那段经历，我就想学爬虫，提高效率。本次抓取的数据就是当时自己手动复制粘贴过的数据：中国银行远期结售汇牌价。
在这里插入图片描述

第一个编码是动词 encode, 
第二个编码名称是名词 encoding, 代码表示为 str.encode(encoding=" ")， 
第二句也就是第一句的逆过程，代码表示 bytes.decode(encoding=" "); 
其中str表示字符串对象，bytes表示二进制对象

截取上面的代码

response = requests.get(url, headers=headers, proxies=proxies)
soup = BeautifulSoup(response.text, "lxml")
print(soup.prettify())

在这里插入图片描述
编码和解码的原则：
数据是如何编码(encode)的，最终就需要以相同（或者相互兼容）的方式解码(decode)。为什么最终呢？最终的意思就是可能由于失误或者其他的一些原因导致第一次解码(decode)用的编码(encoding)和编码(encode)时的编码(encoding)不一致,导致乱码出现，如果再以相同的编码完全逆序编码一次，在解码一次，就可以还原。举个例子：

>>> s = "SacrÃ" # 定义变量s,并把字符串 "SacrÃ" 赋值给s (注意：此时这个符号"Ã"，并非是乱码，而是葡萄牙语的一个字母)
>>> s.encode("utf-8") # 以"utf-8"进行编码
b'Sacr\xc3\x83' # 返回二进制（十六进制的形式表示的，字母还是用对应的Ascii字母表示）
>>> s.encode("utf-8").decode("Latin-1")  # 再以"Latin-1"进行解码（"西欧语系的编码"）
'SacrÃ\x83' # 由于编码和解码的方式不一致导致出现乱码
>>> s.encode("utf-8").decode("Latin-1").encode("Latin-1") # 再用"Latin-1"进行编码
b'Sacr\xc3\x83' # 返回二进制
>>> s.encode("utf-8").decode("Latin-1").encode("Latin-1").decode("utf-8")       # 最后再用 "utf-8"进行解码
'SacrÃ' # 完璧归赵！

对编码原则了解后，再来分析数据交换的过程，就能发现问题了：
1.首先爬虫需要知道服务器发送过来的二进制数据使用什么方式编码的，才能有效的解码
1.1.服务器返回的 Headers(头部信息）中一个字段 Content-Type 一般包含数据的编码方式，例如 Content-Type:text/html; charset=UTF-8 这就表明服务器发送的数据以utf-8编码。
1.2.但是现在的问题是：目前有一些不规范的网站返回的头部文件中没有包含编码信息，此时requests就无法解析出编码，然后就调用程序默认的编码方式：“Latin-1”，为什么是它？某个传输协议上这么规定的，开发requests库的程序员就这么写了，也没错，但就是在中国很不实用。这次编码问题就是由这个引起的
2.解决方案
2.1发现乱码后，先通过 print(response.encoding) 查看requests使用什么方式解码的，再调用 response.content,通过正则找到类似这样的字符串：<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />，其中 charset=utf-8就表明了数据的编码方式。知道编码方式后通过添加语句： response.encoding = "utf-8" （指定编码方式）。正确解析之后，后面的数据交换过程就不会有问题了，为什么呢？因为Requests默认的输出方式是unicode（当然你也可以修改，通过：先编码再解码的方式如： response.text.encode("gbk").decode("gbk")，但是一般情况下没必要）, BeautifulSoup一般可以有效解析出传入其中的编码方式（如果传入的数据本身是字符串，那就不需要解码，如果是某种二进制编码，并且是你本身就知道的编码方式，你可以在解析时传入原始编码方式，如： soup = BeautifulSoup(b'\x34\xa4\x3f', "lxml", from_encoding="utf-8")）,并默认输出为utf-8编码格式（unicode和utf-8兼容），（默认输出的是utf-8,但是如果你想输出其它格式，你也可直接 soup.decode(),当然这个也是没必要的）。关键在第一步！后面直接默认就行。
2.2或者直接通过浏览器查看网页源码，查看HTML网页的头部<meta…>,就可以看到上面的信息了，标签的内容不多，所以可以很快查看到。
3.讨论一下其他情况（也是在中国比较常见的问题）
3.1.看到博客发现其它人出现这样的情况：服务器发送的原始编码是gbk或者gb2312等非unicode体系编码，通过设置之后requests可以正确解析代码，但是最后，soup 输出的格式依然乱码，我觉得不应该，我自己试了一下，并没有出现乱码。（可能和相关程序的版本有关，我用的是: Win10-32bit, Python3)
3.2.国内情况比较复杂，因此还有一种更奇葩的情况：服务器返回的头部中或者HTML头部中有指明数据编码形式，但是：都是错的！想想就可怕，但是确实有这种情况，如：趣彩网，服务器返回的头部中没有标明编码，但是你发现HTML头部中有 charset=gb2312 此时非常开心，然后设置编码 response.encoding="gb2312", 但是乱码还是如约而至，我就又开始怀疑自己了，于是查资料，发现它提供的编码存在不正确的可能性，于是就尝试着设置 response.encoding="utf-8",奇迹出现了，乱码消失！但是问题是我也是碰运气的啊，可能有能检测的方法，于是查资料发现：有两种方法可以判断原始二进制编码格式

# 第三方库bs4中的模块可以在不调用BeautifulSoup的情况下检测二进制编码方式。
from bs4 import UnicodeDammit
dammit = UnicodeDammit(r.content)   #注意：这个需要传入的是二进制原始数据，不能传入字符串
print(dammit.original_encoding)
输出结果为：Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
gb2312      #识别正确

# 专门的编码检测库chardet,好像很牛的样子，我们来试一试：
import chardet
print(chardet.detect(r.content))    # 注意：传入字符串会报错：ValueError: Expected a bytes object, not a unicode object，也就是和上面UnicodeDammit一样，只能传入bytes数据。
输出结果为：{'encoding': 'GB2312', 'confidence': 0.99}          # 真的很棒

总结（再次谢此位大佬！）

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pymysql
import requests
from bs4 import BeautifulSoup
def conn_to_mysql():
    """Open a connection to the local ``spider_data`` MySQL database.

    Returns:
        A ``(connection, cursor)`` tuple; the cursor is created on the
        returned connection.
    """
    settings = dict(host="localhost", user="root", passwd="521513",
                    db="spider_data", port=3306, charset="UTF8")
    db = pymysql.connect(**settings)
    return db, db.cursor()
def get_boc_data(url):
    """Scrape one page at *url* and insert its table rows into MySQL.

    The page is downloaded, decoded as UTF-8 and parsed; every ``<tr>`` of the
    first table matched by ``body div.publish table`` is printed as a tuple of
    cell texts, and rows with exactly seven cells are inserted into the
    ``boc_data_2`` table.  All inserts of a page are committed together.

    Returns ``None``.  Raises ``IndexError`` when the selector matches no
    table; ``requests`` and database errors propagate, but the connection is
    always closed.
    """
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
               "Referer": "http://www.boc.cn/"}
    r = requests.get(url, headers=headers)
    # Force UTF-8 instead of relying on the encoding requests inferred.
    r.encoding = "utf-8"
    print(r.status_code, "\n", r.url)
    soup = BeautifulSoup(r.text, "lxml")
    table_data = soup.select("body div.publish table")[0]

    # Placeholders are filled in by the driver, so scraped cell text containing
    # quotes can neither break the statement nor inject SQL.
    sql = "INSERT INTO boc_data_2(cur_name, cur_id, tra_date, bid_price, off_price,mid_price, date) " \
        "values(%s, %s, %s, %s, %s, %s, %s)"

    conn, cur = conn_to_mysql()
    try:
        for row in table_data.find_all("tr"):
            boc_data = tuple(col.text for col in row.find_all("td"))
            print(boc_data)
            # Only complete data rows have seven cells; other rows are skipped.
            if len(boc_data) == 7:
                cur.execute(sql, boc_data)
        conn.commit()
    finally:
        # Release the connection even if parsing or an insert fails.
        conn.close()
def get_all_page_data(pages):
    """Download the index page followed by pages ``1 .. pages - 1``.

    Each URL is handed to ``get_boc_data`` in order: first ``index.html``,
    then ``index_<n>.html`` for every ``n`` in ``range(1, pages)``.
    """
    def page_urls():
        # Lazily yield the URLs so the first page is processed before the
        # numbered ones are generated.
        yield "http://www.bankofchina.com/sourcedb/ffx/index.html"
        for number in range(1, pages):
            yield "http://www.bankofchina.com/sourcedb/ffx/index_" + str(number) + ".html"

    for target in page_urls():
        get_boc_data(target)


# Run the download only when this file is executed as a script.
if __name__ == "__main__":
    get_all_page_data(4)

参考链接2：
Python爬取网页Utf-8解码错误及gzip压缩问题的解决办法
https://zhuanlan.zhihu.com/p/25095566

在我们用python3爬取一些网站时，获取网页url后进行解析，在采用decode(‘utf-8’)解码时有时候会出现utf-8无法解码的问题，比如结果会提示：

Unicode Decode Error: 'utf8' codec can't decode byte 0xb2 in position 0: invalid start byte

'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

这是因为有些网站进行了gzip压缩，最典型的就是sina，进行网页爬虫经常出现这个问题，那么为什么要压缩呢？搜狗百科解释为：

HTTP协议上的GZIP编码是一种用来改进WEB应用程序性能的技术。大流量的WEB站点常常使用GZIP压缩技术来让用户感受更快的速度。这一般是指WWW服务器中安装的一个功能，当有人来访问这个服务器中的网站时，服务器中的这个功能就将网页内容压缩后传输到来访的电脑浏览器中显示出来.一般对纯文本内容可压缩到原大小的40%.这样传输就快了，效果就是你点击网址后会很快的显示出来.当然这也会增加服务器的负载. 一般服务器中都安装有这个功能模块的。

我们在打开某个新浪网页，点击右键选择“检查”，然后Network》All》Headers下的“Request Headers”中就会发现这个字样：

Accept-Encoding:gzip, deflate, sdch

这个问题有的建议：“看一下设置的header是否存在 ‘Accept-Encoding’:’ gzip, deflate’,这一句话，如果存在，删除即可解决。”，但是有时候header不存在这个代码，怎么删除？如下，我们以打开某个新浪新闻网页为例：

import urllib.request
from bs4 import BeautifulSoup
# The URL must be quoted with plain ASCII quotes; the typographic quotes that
# were here made the script a SyntaxError.
url = 'http://news.sina.com.cn/c/nd/2017-02-05/doc-ifyafcyw0237672.shtml'
req = urllib.request.Request(url)
req.add_header('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0')
page = urllib.request.urlopen(req)  # send the request with a browser-like User-Agent
# NOTE: this is the failing example discussed in the surrounding text -- the
# body is decoded directly as UTF-8, which raises UnicodeDecodeError when the
# server answers with a gzip-compressed body.
txt = page.read().decode('utf-8')
soup = BeautifulSoup(txt, 'lxml')
title = soup.select('#artibodyTitle')[0].text
print(title)

在run之后仍会出现问题，当把decode(‘utf-8’)去掉后得到的页面是乱码的。因此，解决的办法不是如此。
在这里有两种解决办法：（1）采用gzip库解压网页再解码；（2）使用requests库解析网页而不是urllib。
（1）的解决办法为：在“txt = page.read()”页面读取之后，再加入下面这个命令：

txt=gzip.decompress(txt).decode('utf-8')

（2）的解决办法为：

import requests
import gzip
url="http://news.sina.com.cn/c/nd/2017-02-05/doc-ifyafcyw0237672.shtml"
# Fetch the page with requests instead of urllib.
req = requests.get(url)
# Tell requests to decode the body as UTF-8 when .text is accessed.
req.encoding= 'utf-8'

这是对网页用设置为‘utf-8’的格式，但是这里模拟浏览器登录需采用这种方式：

headers = {
    'Host': 'blog.csdn.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    ….
}

参考链接3：UnicodeDecodeError: ‘utf-8’ codec can’t decode byte 0x8b in position 1: invalid start byte
https://blog.csdn.net/zhang_cl_cn/article/details/94575568

from urllib import request
class Spilder():
    """Minimal crawler that downloads the Douyu home page and prints it."""

    # Douyu home page URL
    url = 'https://www.douyu.com/'

    def __fetch_content(self):
        """Fetch the page, print the raw bytes, then print them decoded as UTF-8."""
        response = request.urlopen(Spilder.url)
        raw = response.read()  # raw response bytes (the HTML)
        print(raw)
        # No decompression happens before decoding; the body is decoded as-is.
        text = raw.decode('utf-8')
        print(text)

    def go(self):
        """Public entry point: run the fetch."""
        self.__fetch_content()


spilder = Spilder()
spilder.go()

首先我们观察第一个print输出的字节码可以看到它是以"b’\x1f\x8b\x08"开头的，说明它是gzip压缩过的数据，这也是报错的原因，所以我们需要对我们接收的字节码进行一个解码操作。修改如下：

    url='https://www.douyu.com/'
    def __fetch_content(self):
        """Fetch the page, gunzip the body, decode it as UTF-8 and print it."""
        r = request.urlopen(Spilder.url)
        htmls = r.read()  # raw (gzip-compressed) response bytes
        # Wrap the bytes in an in-memory file so GzipFile can decompress them;
        # the context manager closes the GzipFile once the data has been read.
        with gzip.GzipFile(fileobj=BytesIO(htmls)) as f:
            htmls = f.read().decode('utf-8')
        print(htmls)

Jxufe渣渣斯

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python 网络小说爬取3

/*网络小说：[secret]*/#coding:utf-8import reimport sysfrom bs4 import BeautifulSoupimport urllib.requestimport timeimport randomproxy_list = [ {"http":"124.88.67.54:80"}, {"http":"61.135.217....
复制链接

扫一扫