写入excel部分
写入 Excel 时需要追加写入,也就是在原有内容之后继续写:先读出文件中已有数据的行数,再从下一行开始接着写。
Python 库 xlrd 和 xlwt 分别适合 .xls 文件的读取和写入,再借助 xlutils.copy 把 xlrd 读出的工作簿转换成可写的 xlwt 工作簿。下面是追加写入的函数。
def write_excel_xls_append(path, value):
    """Append rows to the first sheet of an existing workbook and save it.

    The workbook at ``path`` is opened with xlrd, converted into a writable
    xlwt object with ``copy`` (``xlutils.copy.copy``), extended with the new
    rows, and saved back to the same ``path``.

    Args:
        path: Location of the workbook to read; the result is saved to the
            same location.
        value: Iterable of rows, each row an iterable of cell values. Row
            ``k`` is written at row index ``existing_rows + k``, with its
            cells starting at column 0.
    """
    workbook = xlrd.open_workbook(path)
    # Number of rows already present in the first sheet; new data starts
    # right after them.
    rows_old = workbook.sheet_by_index(0).nrows
    new_workbook = copy(workbook)  # xlrd workbook -> writable xlwt workbook
    new_worksheet = new_workbook.get_sheet(0)
    for row_offset, row in enumerate(value):
        for col, cell in enumerate(row):
            new_worksheet.write(rows_old + row_offset, col, cell)
    new_workbook.save(path)
爬虫部分
爬虫与解析部分:
爬虫核心代码:
requests.get()
然后用 BeautifulSoup 库解析返回的 HTML,下面是完整代码。
完整代码
# -*- coding: UTF-8 -*-
import time
import requests
import xlwt
from bs4 import BeautifulSoup
import xlrd
from xlutils.copy import copy
def write_excel_xls_append(path, value):
    """Append rows to the first sheet of an existing workbook and save it.

    The workbook at ``path`` is opened with xlrd, converted into a writable
    xlwt object with ``copy`` (``xlutils.copy.copy``), extended with the new
    rows, and saved back to the same ``path``.

    Args:
        path: Location of the workbook to read; the result is saved to the
            same location.
        value: Iterable of rows, each row an iterable of cell values. Row
            ``k`` is written at row index ``existing_rows + k``, with its
            cells starting at column 0.
    """
    workbook = xlrd.open_workbook(path)
    # Number of rows already present in the first sheet; new data starts
    # right after them.
    rows_old = workbook.sheet_by_index(0).nrows
    new_workbook = copy(workbook)  # xlrd workbook -> writable xlwt workbook
    new_worksheet = new_workbook.get_sheet(0)
    for row_offset, row in enumerate(value):
        for col, cell in enumerate(row):
            new_worksheet.write(rows_old + row_offset, col, cell)
    new_workbook.save(path)
# Crawl the Douban Top 250 list page by page and append each page's
# [title, director] rows to an .xls workbook via write_excel_xls_append.

# Cell style; defined here but not applied by any write below.
style = xlwt.easyxf('font:height 240, color-index red, bold on;align: wrap on, vert centre, horiz center')
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'Host': 'movie.douban.com'
}
# Target workbook: change the file location and file name here.
output_path = 'E:/PythonProject/API/Project_Pathplan/abc.xls'

# Results accumulated across all crawled pages.
movie_list = []
director_list = []
time_list = []
star_list = []  # never filled by this script

for i in range(0, 10):  # change the number of pages to crawl
    link = 'https://movie.douban.com/top250?start=' + str(i * 25)
    res = requests.get(link, headers=headers, timeout=10)
    soup = BeautifulSoup(res.text, "lxml")
    div_list = soup.find_all('div', class_='hd')
    div1_list = soup.find_all('div', class_='bd')

    page_movies = [each.a.span.text.strip() for each in div_list]

    page_directors = []
    for each in div1_list:
        info = each.p.text.strip()
        if len(info) < 3:
            continue
        # NOTE(review): fixed offsets relative to the first '...' in the
        # text; presumably meant to capture the release year - confirm.
        end = info.find('...')
        time_list.append(info[end + 32:end + 36])
        end = info.find('主')
        if end >= 0:
            # Drop the 4-character prefix and the 3 characters before '主'.
            director = info[4:end - 3]
        else:
            # find() returned -1: slicing with end - 3 would swallow almost
            # the whole paragraph, so fall back to the rest of the first line.
            director = info[4:].split('\n', 1)[0].strip()
        page_directors.append(director)

    movie_list.extend(page_movies)
    director_list.extend(page_directors)

    # Append only this page's rows. Writing the accumulated lists on every
    # iteration would repeat all earlier pages in the workbook, and a leading
    # empty row would leave a blank line before each batch.
    w_results = [
        [movie, director]
        for movie, director in zip(page_movies, page_directors)
    ]
    write_excel_xls_append(output_path, w_results)
    print(movie_list)
    print(director_list)
    time.sleep(3)  # pause between page requests