最近在写一个爬虫程序,读取请求地址时用到了 html=res.read().decode("utf-8") 里的 decode 方法,但是一直报错如下:
UnicodeEncodeError: 'gbk' codec can't encode character '\u10e6' in position 181: illegal multibyte sequence
然后自己查询资料,发现需要把 decode 方法改为 decode("gbk", "ignore")(即按页面实际编码解码,并忽略无法解码的字节),然后就解决了,直接上代码如下:
from urllib import request
import csv
import re
import random
import time
class NickName(object):
    """Crawl the nickname list pages of yimanwu.com and append the
    scraped entries to a local CSV file (weixinspider.csv).
    """

    def __init__(self):
        # Page-number placeholder is filled in by main() via str.format().
        self.url = 'https://yimanwu.com/nansheng/list_64_{}.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'
        }
        # Progress counter reported by main().
        self.page = 1

    def get_page(self, url):
        """Fetch *url* and hand the decoded HTML to parse_page().

        Fix: the original decoded with a hard-coded "gbk" codec and
        errors="ignore", silently dropping bytes on non-GBK pages.
        Honor the charset declared in the Content-Type header, fall
        back to UTF-8, and substitute (rather than drop) undecodable
        bytes so positions stay intact.
        """
        req = request.Request(url, headers=self.headers)
        # Context manager guarantees the HTTP connection is released
        # (original never closed the response object).
        with request.urlopen(req) as res:
            charset = res.headers.get_content_charset() or 'utf-8'
            html = res.read().decode(charset, 'replace')
        self.parse_page(html)

    def parse_page(self, html):
        """Extract every <li><p>...</p></li> payload and persist it.

        Returns the extracted list as well (the original returned None;
        returning it is backward-compatible and eases testing).
        """
        pattern = re.compile(r'<li><p>(.*?)</p></li>', re.S)
        result = pattern.findall(html)
        print(result)
        self.write_csv(result)
        return result

    def write_csv(self, film_list):
        """Append one scraped entry per line to weixinspider.csv.

        Fix: removed the redundant f.close() inside the `with` block
        and replaced the range(len(...)) index loop with one batched
        writelines() call.
        """
        with open('weixinspider.csv', 'a+', encoding='utf-8') as f:
            f.writelines(item + '\n' for item in film_list)

    def main(self):
        """Crawl pages 1..384 with a short random delay between requests."""
        for offset in range(1, 385):
            url = self.url.format(offset)
            print(url)
            self.get_page(url)
            print("第%d页爬取完成 " % self.page)
            self.page += 1
            # Be polite to the server between requests.
            time.sleep(random.randint(1, 3))
if __name__ == "__main__":
    # Script entry point: build the crawler and run the full sweep.
    spider = NickName()
    spider.main()