爬虫后将数据写入excel的两种方式:
第一种是用xlwt模块,这种写法代码比较多,也比较繁琐,但是一点问题都没有。
第一种方式源代码
#!/usr/bin/env python
import requests
import re
import xlwt
def main():
    """Crawl the Dangdang bestseller list (pages 1-3) and save it to dangdang.xls.

    For each list page, the HTML is fetched, parsed into 8-tuples of
    (rank, image URL, title, review count, recommend rate, author, press,
    price), and written one row per book below a fixed header row.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    mysheet = workbook.add_sheet('mysheet')
    # Header labels, one per captured regex group, in the same column order
    # that parse_result() returns its tuple fields.
    headers = ('排名', '图片地址', '书名', '评论数量', '推荐指数', '作者', '出版社', '价格')
    for col, label in enumerate(headers):
        mysheet.write(0, col, label)
    row = 1  # row 0 is the header
    for page in range(1, 4):
        url = 'http://bang.dangdang.com/books/bestsellers/1-' + str(page)
        html = getHTMLText(url)
        items = parse_result(html)
        print(items)
        for item in items:
            # Each item is an 8-tuple; write one cell per field.
            for col, value in enumerate(item):
                mysheet.write(row, col, value)
            row += 1
    workbook.save('dangdang.xls')
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or the sentinel '异常' on failure.

    A 30-second timeout guards against hanging connections, and HTTP error
    status codes are promoted to exceptions so they also map to the sentinel.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding  # guess real encoding from the body
        return r.text
    # Narrowed from a bare `except:` — a bare except also swallows
    # KeyboardInterrupt/SystemExit and hides programming errors.
    except requests.RequestException:
        return "异常"
def parse_result(html):
    """Extract bestseller entries from a Dangdang ranking page.

    Returns a list of 8-tuples:
    (rank, image URL, title, review count, recommend rate, author, press, price).
    Pages with no matching <li> entries yield an empty list.
    """
    # re.S lets the lazy `.*?` gaps span the newlines between tags.
    book_pattern = re.compile(
        '<li>.*?list_num.*?>(.*?).</div>.*?<img.*?src="(.*?)".*?class="name".*?title="(.*?)".*?class="level".*?target="_blank">(.*?)</a>.*?class="tuijian">(.*?)</span>.*?class="publisher_info".*?title="(.*?)".*?</a>.*?class="publisher_info".*?target="_blank">(.*?)</a>.*?class="price_n">¥(.*?)</span>.*?</li>',
        re.S)
    return book_pattern.findall(html)
# Entry point: run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()
第二种本想用pandas模块,代码比较少,但是问题有点多:我之前写的时候,后写入的数据会把前面的覆盖掉,所以最终改用 with open 配合 csv 模块来写。
第二种方式源代码
import requests
from lxml import etree
import csv
def get_info(url):
    """Scrape the top book entry from a Dangdang bestseller page and append it to dangdang.csv.

    Each call appends one row: (title, image URL, recommend rate, author, price).

    Fixes over the original:
    - The author string was stored in a variable named ``writer`` that was then
      shadowed by ``csv.writer(f)``, so the row contained the writer object's
      repr instead of the author. The author now lives in its own name.
    - The file was opened with mode ``'w'``, truncating it on every call so
      each page overwrote the previous one. Append mode ``'a'`` preserves rows.
    - A trailing ``zip`` over the five strings iterated them character by
      character, emitting one garbage row per character; it is removed.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers).text
    result = etree.HTML(response)
    # XPath extraction; [0] keeps only the first (top-ranked) entry per page.
    bookname = result.xpath('//div[@class="name"]/a/@title')[0]
    image = result.xpath('//div[@class="pic"]/a/img/@src')[0]
    recommend = result.xpath('//div[@class="star"]/a/text()')[0]
    author = result.xpath('//div[@class="publisher_info"]/a/text()')[0]
    price = result.xpath('//div[@class="price"]/p/span[@class="price_n"]/text()')[0]
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open('dangdang.csv', 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([bookname, image, recommend, author, price])
def run():
    """Crawl bestseller pages 1-3, appending one row per page to dangdang.csv.

    The original ``while 1: ... break`` executed exactly one iteration with
    page=0, which is not a valid Dangdang ranking page number; this bounded
    loop expresses the intended pagination and matches the xlwt version's
    ``range(1, 4)``.
    """
    for page in range(1, 4):
        url = 'http://bang.dangdang.com/books/bestsellers/1-' + str(page)
        get_info(url)
# Entry point: start the CSV crawler only when executed directly.
if __name__ == '__main__':
    run()