Python3.5 ‘wb’与‘w’区别以及写入excel的常见错误

Python3.5 ‘wb’与‘w’区别以及写入excel的常见错误

望共同进步

转载请注明地址:http://blog.csdn.net/weixin_39701039/article/details/79576549

"r"   以读方式打开,只能读文件 , 如果文件不存在,会发生异常      

"w" 以写方式打开,只能写文件, 如果文件不存在,创建该文件;如果文件已存在,先清空,再打开文件                                                
"rb"   以二进制读方式打开,只能读文件 , 如果文件不存在,会发生异常      

"wb" 以二进制写方式打开,只能写文件, 如果文件不存在,创建该文件;如果文件已存在,先清空,再打开文件

这里结合前面写的 Python3.5 爬虫之由浅入深(三、html转excel)来看看'w'和'wb'的区别,以及延伸地说说把爬取的内容存成伪excel时遇到的问题;

一:UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position 13785: illegal multibyte sequence  

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup


# Local working directory; not referenced by the code shown in this snippet.
path = r'G:\任务20180312'

# Page that is downloaded, and the base URL of the same site.
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

# HTTP request headers: send a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def get_Soup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` and a 120-second
    timeout. The body is decoded as UTF-8 and parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=120)
    # Stop on an error status instead of silently parsing an error page.
    response.raise_for_status()
    # Force UTF-8 so that response.text does not rely on charset detection.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
# Fetch and parse the page.
soup = get_Soup(url)
#print(soup)
# First <table> whose style attribute equals the given string; the result is
# a bs4.element.Tag.
table = soup('table',style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
# Serialise the tag to an HTML string.
result1 = str(table)
#print(result1)

# First <tbody> in the document, also a bs4.element.Tag.
tbody = soup('tbody')[0]
result2 = str(tbody)     # serialised as well; not written out in this snippet

# Text mode with no encoding argument: the platform default encoding is used.
# On the author's Windows setup that is GBK, and this write is the step that
# raises the UnicodeEncodeError this section demonstrates.
with open(r'G:\任务20180312\test\html_excel/test1.xls','w') as f1:
    f1.write(result1)

#结果:



因为在Windows下运行时,open()以文本模式('w')打开文件且没有指定encoding,Python会使用系统默认编码把字符串编码后再写入文件(在cmd中print出来时同理);中文Windows的默认codepage是CP936,即GBK的编码。但是由于该Unicode字符串中包含一些GBK无法表示的字符(例如这里的'\xa0',即不间断空格),导致此时提示“'gbk' codec can't encode”的错误。

这个时候我们可以在with open(..)括号里加入编码方式 encoding='utf-8',如下代码:

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup


# Local working directory; not referenced by the code shown in this snippet.
path = r'G:\任务20180312'

# Page that is downloaded, and the base URL of the same site.
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

# HTTP request headers: send a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def get_Soup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` and a 120-second
    timeout. The body is decoded as UTF-8 and parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=120)
    # Stop on an error status instead of silently parsing an error page.
    response.raise_for_status()
    # Force UTF-8 so that response.text does not rely on charset detection.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
# Download the page and parse it into a BeautifulSoup tree.
soup = get_Soup(url)

# Select the table by its inline style string. find_all() returns a list and
# its first element is a bs4.element.Tag.
table = soup.find_all('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # HTML markup of the table as a str

# First <tbody> of the document, serialised the same way.
tbody = soup.find_all('tbody')[0]
result2 = str(tbody)

# Text mode with an explicit encoding, so the write no longer depends on the
# platform default encoding.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'w', encoding='utf-8') as f1:
    f1.write(result1)

#结果:



二:ValueError: binary mode doesn't take an encoding argument

如下代码:

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup


# Local working directory; not referenced by the code shown in this snippet.
path = r'G:\任务20180312'

# Page that is downloaded, and the base URL of the same site.
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

# HTTP request headers: send a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def get_Soup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` and a 120-second
    timeout. The body is decoded as UTF-8 and parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=120)
    # Stop on an error status instead of silently parsing an error page.
    response.raise_for_status()
    # Force UTF-8 so that response.text does not rely on charset detection.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
# Fetch and parse the page.
soup = get_Soup(url)
#print(soup)
# First <table> whose style attribute equals the given string; the result is
# a bs4.element.Tag.
table = soup('table',style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
# Serialise the tag to an HTML string.
result1 = str(table)
#print(result1)

# First <tbody> in the document, also a bs4.element.Tag.
tbody = soup('tbody')[0]
result2 = str(tbody)     # serialised as well; not written out in this snippet

# Deliberately wrong, to reproduce the error in this section's heading:
# binary mode 'wb' combined with encoding= makes open() raise
# "ValueError: binary mode doesn't take an encoding argument".
# The write below is never reached; result1 is a str, while a file opened in
# binary mode expects bytes.
with open(r'G:\任务20180312\test\html_excel/test1.xls','wb',encoding='utf-8') as f1:
    f1.write(result1)

#结果:


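因为'wb'是以二进制方式写入文件,二进制模式下不做编码,所以open()不接受encoding参数,于是报上面的ValueError;另外,即使去掉encoding参数,result1是字符串(str)而不是字节串(bytes),f1.write(result1)也会报错(TypeError),此时写入文件为0kb,即没有结果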

这里可以把result1转变为字节串 bytes(result1,encoding='utf-8'),同时去掉open()里的encoding参数

如下:

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup


# Local working directory; not referenced by the code shown in this snippet.
path = r'G:\任务20180312'

# Page that is downloaded, and the base URL of the same site.
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

# HTTP request headers: send a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def get_Soup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` and a 120-second
    timeout. The body is decoded as UTF-8 and parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=120)
    # Stop on an error status instead of silently parsing an error page.
    response.raise_for_status()
    # Force UTF-8 so that response.text does not rely on charset detection.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
# Download the page and parse it into a BeautifulSoup tree.
soup = get_Soup(url)

# Select the table by its inline style string. find_all() returns a list and
# its first element is a bs4.element.Tag.
table = soup.find_all('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # HTML markup of the table as a str

# First <tbody> of the document, serialised the same way.
tbody = soup.find_all('tbody')[0]
result2 = str(tbody)

# Binary mode takes no encoding argument, so the str is encoded to UTF-8
# bytes explicitly before it is written.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'wb') as f1:
    f1.write(bytes(result1, encoding='utf-8'))

#结果:



注意,这里with open(..)括号里没有encoding=部分了,因为二进制模式不能再指定编码了,不然会报错ValueError: binary mode doesn't take an encoding argument

这里之所以写成bytes(result1,encoding='utf-8'),是因为把字符串(str)转为字节串(bytes)时需要指定编码方式


三:得到的文件不是我们想要的表格形式,而是一堆字符串

#coding:utf-8
#python3.5.1

import re
import requests
import time
import os
from bs4 import BeautifulSoup


# Local working directory; not referenced by the code shown in this snippet.
path = r'G:\任务20180312'

# Page that is downloaded, and the base URL of the same site.
url = 'http://tjj.suqian.gov.cn/stjj/ndsj/201609/d9bbdb1109cf497e80e59c56ce216ce0.shtml'
url_prefix = 'http://tjj.suqian.gov.cn/'

# HTTP request headers: send a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def get_Soup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` and a 120-second
    timeout. The body is decoded as UTF-8 and parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=120)
    # Stop on an error status instead of silently parsing an error page.
    response.raise_for_status()
    # Force UTF-8 so that response.text does not rely on charset detection.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')
# Download the page and parse it into a BeautifulSoup tree.
soup = get_Soup(url)

# Select the table by its inline style string. find_all() returns a list and
# its first element is a bs4.element.Tag.
table = soup.find_all('table', style="WIDTH: 542pt; BORDER-COLLAPSE: collapse")[0]
print(type(table))
result1 = str(table)  # HTML markup of the table as a str

# First <tbody> of the document, serialised the same way.
tbody = soup.find_all('tbody')[0]
result2 = str(tbody)

# Write both fragments as UTF-8 text so the two output files can be compared:
# test1.xls holds the <table> markup, test2.xls holds the <tbody> markup.
with open(r'G:\任务20180312\test\html_excel/test1.xls', 'w', encoding='utf-8') as f1:
    f1.write(result1)

with open(r'G:\任务20180312\test\html_excel/test2.xls', 'w', encoding='utf-8') as f2:
    f2.write(result2)
 

#结果:



那现在我们发现区别在于result1和result2

右键网页打开源代码,来查看区别:


区别在于result1比result2少了些代码(因为我们存入的文件形式为伪excel,所以这个是有关系的),现在用html工具(这里我用的editplus)分别将这两部分代码以浏览器形式打开:




PS:所以我们要把带样式的代码也抓取下来,建议可以看看html5和css,了解一下

望有所帮助,望采纳!!

  • 2
    点赞
  • 16
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值