一开始用的保存函数会把原文件的内容整体替换掉;后来换了一种方法(先复制再写回),就可以向文件追加数据内容了。
两种方法的对比
1.追加数据的函数
# Append rows to an existing .xls workbook: open it read-only with xlrd,
# make a writable copy with xlutils.copy, write through the copy, then
# save it back over the original file.
# NOTE(review): `k` and `datas` are assumed to be defined by the caller —
# this fragment mirrors QiShuSpider.data_write below.
old_file = xlrd.open_workbook('qishu.xls')
new_file = copy(old_file)
new_sheet = new_file.get_sheet(0)
row = k  # number of data rows already present in the file
for data in datas:
    for j in range(len(data)):
        new_sheet.write(row, j, data[j])
    row += 1  # advance to the next free row after each record
new_file.save('qishu.xls')
print('写入成功')
2.替换数据的函数
def data_write(qishu, datas):
    """Write `datas` into a brand-new .xls workbook, replacing any old file.

    Unlike the append variant above, xlwt always builds the workbook from
    scratch, so existing content at `qishu` is lost.

    :param qishu: output .xls path
    :param datas: iterable of row sequences (one sequence per spreadsheet row)
    """
    f = xlwt.Workbook()
    # cell_overwrite_ok allows the same cell to be written more than once
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)  # create sheet
    # BUG FIX: `i` was never initialised in the original, raising NameError
    # on the first write; start at row 0.
    i = 0
    # Write the data into row i, column j.
    for data in datas:
        for j in range(len(data)):
            sheet1.write(i, j, data[j])
        i = i + 1
    f.save(qishu)  # save the workbook
    print('写入成功')
from urllib.error import HTTPError
from xlutils.copy import copy
import re, pymongo, requests, xlrd, xlwt
class DataTool(object):
    """Cleans whitespace artifacts out of a scraped six-field record."""

    # Compiled patterns for characters that leak in from the scraped HTML.
    pattern_n = re.compile(r'\n', re.S)
    pattern_r = re.compile(r' ', re.S)
    pattern_br = re.compile(r' ', re.S)
    pattern_b = re.compile(r' ', re.S)

    def process_tuple_data(self, origin_tuple_data):
        """Strip newlines and spaces from field 4 and return a cleaned 6-tuple.

        Fields 0-3 and 5 are passed through untouched.
        """
        cleaned = origin_tuple_data[4]
        # Apply every cleanup pattern in turn to the content field.
        for pattern in (self.pattern_n, self.pattern_r,
                        self.pattern_br, self.pattern_b):
            cleaned = pattern.sub('', cleaned)
        return (origin_tuple_data[0], origin_tuple_data[1],
                origin_tuple_data[2], origin_tuple_data[3],
                cleaned, origin_tuple_data[5])
class QiShuSpider(object):
    """Scrapes book listings from qisuu.la and appends them to qishu.xls."""

    # Shared MongoDB connection; not used by the visible methods but kept
    # for compatibility with the rest of the project.
    client = pymongo.MongoClient('localhost')
    db = client['dbmovie']

    def __init__(self):
        # NOTE(review): the User-Agent value was redacted in the original
        # source; substitute a real browser UA string before running.
        self.headers = {
            'User-Agent': '*****************************************'
        }
        self.base_url = 'https://www.qisuu.la/soft/sort01/'
        self.tool = DataTool()

    def get_total_page_num(self):
        """
        Return the total number of result pages, parsed from the
        "页次:1/N" pager text on the first listing page.
        :return: int total page count
        """
        response = requests.get(self.base_url, headers=self.headers)
        html = response.text
        total_num_pattern = re.compile(r'<div class="tspage".*?页次:1/(.*?) ', re.S)
        total_num = int(re.findall(total_num_pattern, html)[0])
        return total_num

    def get_list_html(self, page, page_num):
        """Fetch listing page `page_num` of category `page`.

        :return: the page's HTML text, or None on a request failure.
        """
        list_url = 'https://www.qisuu.la/soft/sort0{}/index_{}.html'.format(page, page_num)
        try:
            response = requests.get(list_url, headers=self.headers)
        except (HTTPError, requests.exceptions.RequestException) as e:
            # BUG FIX: requests never raises urllib's HTTPError, so the old
            # handler was dead code and network errors crashed the spider;
            # catch requests' own exception hierarchy as well.
            print('列表页异常:url={}, error={}'.format(list_url, e))
            # BUG FIX: was `return None, None` — a 2-tuple is truthy, so the
            # caller's `if list_html:` check passed even on failure.
            return None
        return response.text

    def parse_list_html(self, html):
        """Extract book records from a listing page.

        :return: list of 6-tuples (raw regex captures), or None when
                 `html` is falsy.
        """
        if html:
            pattern = re.compile(
                r'<div class="s">(.*?)<br.*?>(.*?)<br>.*?<em class="lstar3">.*?<br>(.*?)</div>.*?<img .*?>(.*?)</a>.*?<div class="u">(.*?)</div>.*?<a.*?>(.*?)</a>',
                re.S)
            detail_urls = re.findall(pattern, html)
            for detail_url in detail_urls:
                # Cleaned copy is only printed here; the raw tuples are returned.
                new_detail = self.tool.process_tuple_data(detail_url)
                print(new_detail)
            return detail_urls
        else:
            print('html源代码为None')
            return None

    def data_write(self, k, qishu, datas):
        """Append `datas` to qishu.xls starting at row `k`.

        :param k: number of rows already present in the file
        :param qishu: nominally the target filename — see NOTE below
        :param datas: iterable of row sequences
        """
        # NOTE(review): the `qishu` argument is ignored; the filename is
        # hard-coded because, per the original note, the xlrd/xlwt tool
        # chain only supports .xls (not .xlsx). The .xls file must exist
        # before the first run.
        old_file = xlrd.open_workbook('qishu.xls')
        new_file = copy(old_file)
        new_sheet = new_file.get_sheet(0)
        row = k  # number of data rows already present in the file
        for data in datas:
            for j in range(len(data)):
                new_sheet.write(row, j, data[j])
            row += 1
        new_file.save('qishu.xls')
        print('写入成功')

    def start_spider(self, i, x, num):
        """Fetch page `num` of category `x` and append the rows at offset `i`."""
        print('正在请求第{}页'.format(num))
        list_html = self.get_list_html(x, num)
        if list_html:
            detail_urls = self.parse_list_html(list_html)
            self.data_write(i, 'qishu.xlsx', detail_urls)
if __name__ == '__main__':
    # Crawl every category/page combination, appending 15 rows per page.
    spider = QiShuSpider()
    total_pages = spider.get_total_page_num()
    # NOTE(review): range(1, total_pages) stops one short of the last page —
    # confirm whether the final page is intentionally skipped.
    row_offset = 0
    for category in range(1, 12):
        for page_index in range(1, total_pages):
            spider.start_spider(row_offset, category, page_index)
            row_offset += 15
运行结果: