(一)安装xlsxwriter、xlrd模块
首先打开cmd,通过下列指令安装XlsxWriter(注意拼写)模块和xlrd模块
pip install XlsxWriter
pip install xlrd
如果cmd下pip命令无法使用,则应在系统环境变量PATH中添加Python安装路径下的Scripts文件夹。
一台电脑若同时安装了Python 3.6.x和2.7.x版本,模块会被安装到命令行中pip所对应的那个版本下(本文环境中为3.6.x);如需明确指定版本,可使用"python -m pip install 模块名"的形式调用对应版本的解释器。安装成功后,在PyCharm的Settings - Project Interpreter中可查看到已安装的模块。
具体使用信息可参见https://xlsxwriter.readthedocs.io/getting_started.html
(二)爬取出版社信息(使用3.6.x版本)
import os
import re
import urllib.request

import xlrd
import xlsxwriter


def build_xls(name):
    """Create a new xlsxwriter workbook in ``target_dir`` without overwriting.

    The file is named ``<name>.xlsx``. If that file already exists, the first
    free name of the form ``<name>(1).xlsx``, ``<name>(2).xlsx``, ... is used.

    Args:
        name: base file name, without directory and without extension.

    Returns:
        An ``xlsxwriter.Workbook`` bound to the chosen path. The caller is
        responsible for closing it.
    """
    # xlsxwriter only writes the file when the workbook is closed, so make
    # sure the directory exists now instead of failing late.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # os.path.join keeps the path valid whether or not target_dir ends with
    # a separator.
    path = os.path.join(target_dir, name + '.xlsx')
    suffix = 1
    while os.path.exists(path):
        path = os.path.join(target_dir, '%s(%s).xlsx' % (name, suffix))
        suffix += 1
    return xlsxwriter.Workbook(path)
def build_sheet(f, name):
    """Add a worksheet to a workbook and hand it back.

    Args:
        f: workbook object that provides ``add_worksheet``.
        name: name of the worksheet to create.

    Returns:
        Whatever ``f.add_worksheet(name)`` returns (the new worksheet).
    """
    sheet = f.add_worksheet(name)
    return sheet
def open_xls(dir):
    """Open an existing workbook with xlrd.

    Args:
        dir: path of the workbook file. The name shadows the builtin ``dir``
            but is kept so existing keyword callers keep working.

    Returns:
        The workbook returned by ``xlrd.open_workbook``, or ``None`` if it
        could not be opened (the error is printed, not raised).
    """
    # NOTE(review): xlrd 2.x only reads legacy .xls files, so the .xlsx files
    # produced by build_xls can only be reopened with xlrd < 2 -- confirm the
    # installed version.
    try:
        return xlrd.open_workbook(dir)
    except Exception as err:
        # Best effort: report the problem and signal failure with None.
        print(err)
        return None


target_dir = r'c:/backup/'

# The response body is bytes; decode it to str so that the str regular
# expressions used later can be applied (a str pattern cannot be matched
# against bytes). The with-block closes the HTTP response after reading.
with urllib.request.urlopen("https://read.douban.com/provider/all") as response:
    data = response.read().decode("utf-8")
# Collect the text inside every <div class="name">...</div> element of the
# downloaded page.
name_pattern = re.compile('<div class="name">(.*?)</div>')
publishers = name_pattern.findall(data)

workbook = build_xls('book')
sheet = build_sheet(workbook, 'publisher')

# Bold header in A1, then one publisher per row starting at row 1.
header_format = workbook.add_format({'bold': 1})
sheet.write(0, 0, r"出版社信息", header_format)
for row_index, publisher in enumerate(publishers, start=1):
    sheet.write(row_index, 0, publisher)

workbook.close()
(三)爬取出版社信息和作品数量,并存入不同的工作表(sheet)
import os
import re
import urllib.request

import xlrd
import xlsxwriter


def build_xls(name):
    """Create a new xlsxwriter workbook in ``target_dir`` without overwriting.

    The file is named ``<name>.xlsx``. If that file already exists, the first
    free name of the form ``<name>(1).xlsx``, ``<name>(2).xlsx``, ... is used.

    Args:
        name: base file name, without directory and without extension.

    Returns:
        An ``xlsxwriter.Workbook`` bound to the chosen path.
    """
    # xlsxwriter only writes the file when the workbook is closed, so make
    # sure the directory exists now instead of failing late.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # os.path.join keeps the path valid whether or not target_dir ends with
    # a separator.
    path = os.path.join(target_dir, name + '.xlsx')
    suffix = 1
    while os.path.exists(path):
        path = os.path.join(target_dir, '%s(%s).xlsx' % (name, suffix))
        suffix += 1
    return xlsxwriter.Workbook(path)


def build_sheet(f, name):
    """Add a worksheet called ``name`` to workbook ``f`` and return it."""
    return f.add_worksheet(name)


def open_xls(dir):
    """Open an existing workbook with xlrd.

    Args:
        dir: path of the workbook file. The name shadows the builtin ``dir``
            but is kept so existing keyword callers keep working.

    Returns:
        The workbook returned by ``xlrd.open_workbook``, or ``None`` if it
        could not be opened (the error is printed, not raised).
    """
    try:
        return xlrd.open_workbook(dir)
    except Exception as err:
        # Best effort: report the problem and signal failure with None.
        print(err)
        return None


target_dir = r'c:/backup/'

# The response body is bytes; decode it to str so the str regular expressions
# below can be applied. The with-block closes the HTTP response after reading.
with urllib.request.urlopen("https://read.douban.com/provider/all") as response:
    data = response.read().decode("utf-8")

pat = '<div class="name">(.*?)</div>'
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
pat2 = r'<div class="works-num">.*?(\d+).*?</div>'
ans = re.compile(pat).findall(data)
ans2 = re.compile(pat2).findall(data)

# The with-block closes (and thereby saves) the workbook even if a write
# raises part-way through.
with build_xls('book') as f:
    bold = f.add_format({'bold': 1})

    # Sheet 1: publisher names only.
    worksheet = build_sheet(f, 'publisher')
    worksheet.write(0, 0, r"出版社信息", bold)
    for row, publisher in enumerate(ans, start=1):
        worksheet.write(row, 0, publisher)

    # Sheet 2: publisher name next to its number of works.
    worksheet = build_sheet(f, 'publisher&name')
    worksheet.write(0, 0, r"出版社信息", bold)
    worksheet.write(0, 1, r"作品数量", bold)
    # Names and counts come from two independent regex scans and are paired
    # by position; zip stops at the shorter list instead of raising
    # IndexError when the scans return different numbers of matches.
    for row, (publisher, works) in enumerate(zip(ans, ans2), start=1):
        worksheet.write(row, 0, publisher)
        # Store the count as a number so the column is numeric in Excel.
        worksheet.write(row, 1, int(works))