爬虫爬到了想要的数据,下一步该想的就是如何去存储/以什么样的存储格式去存储
这一次我将用一个实例展示4种数据存储方法:CSV、EXCEL、TXT、图片(python3.6)
我们爬取前程无忧招聘页面,而要获取信息是:公司、职位、地点、薪水、发布日期。
import requests
from lxml import etree

# First page of 51job (前程无忧) search results; the site serves GBK-encoded HTML.
url = 'https://search.51job.com/list/080200,000000,0000,32,9,99,%2B,2,1.html'
# NOTE(fix): the original "User-Agent" value wrongly began with the literal
# text "User-Agent:" and "keep - alive" contained invalid spaces; both are
# malformed header values and are corrected here.
header = {
    "User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Connection": "keep-alive",
}

response = requests.get(url, headers=header)
# Decode as GBK, dropping undecodable bytes instead of raising.
html = etree.HTML(response.content.decode('gbk', 'ignore'))

connect_list = []
rows = html.xpath("//div[@class ='el']")
print(rows)
for row in rows:  # renamed from `list`, which shadowed the builtin
    # ''.join(...) collapses each xpath result list into a single string,
    # which is convenient for tabular storage (Excel/CSV) later.
    item = {
        'job': ''.join(row.xpath("./p/span/a/@title")),
        'company': ''.join(row.xpath("./span[@class = 't2']/a/@title")),
        'locate': ''.join(row.xpath("./span[@class = 't3']/text()")),
        'salary': ''.join(row.xpath("./span[@class = 't4']/text()")),
        'time': ''.join(row.xpath("./span[@class = 't5']/text()")),
    }
    connect_list.append(item)
以上我们已经初步获取了想要的数据,接下来要做的就是存储!!
- EXCEL
保存到excel文件中(.xlsx)
from openpyxl import Workbook  # third-party package used for .xlsx output

# Save the scraped records to an Excel workbook (.xlsx).
# NOTE(fix): the original used C-style `//` comments, which are syntax
# errors in Python; they are replaced with `#` comments here.
wb = Workbook()   # create a new in-memory workbook
ws = wb.active    # use the default (active) worksheet
ws.append(['job', 'company', 'locate', 'salary', 'time'])  # header row
# The author skips the first 4 scraped items — presumably non-job rows
# matched by the xpath (e.g. page column headers); TODO confirm.
for connect in connect_list[4:]:
    ws.append([connect['job'], connect['company'], connect['locate'],
               connect['salary'], connect['time']])
wb.save('job2.xlsx')
- CSV
保存到csv文件中(.csv)
import csv

# Save the scraped records to a CSV file (.csv).
# NOTE(fix): the original used C-style `//` comments (Python syntax errors)
# and named variables `lists`/`list`, shadowing the builtin.
rows = []
for connect in connect_list[4:]:  # same skip-first-4 convention as the Excel writer
    rows.append([connect['job'], connect['company'], connect['locate'],
                 connect['salary'], connect['time']])

# newline="" is required: without it the csv module emits an extra blank
# line between records on Windows.
with open("job.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, dialect="excel")
    writer.writerow(['job', 'company', 'locate', 'salary', 'time'])  # header row
    writer.writerows(rows)
- TXT
保存到文本文件(.txt)
import json

# Save the scraped records to a text file (.txt), one JSON object per line
# (JSON-lines style). Opened in append mode, so repeated runs accumulate.
with open('3.txt', 'a', encoding='utf8') as f:
    for c in connect_list[4:]:  # same skip-first-4 convention as the other writers
        # json.dumps serializes the dict; ensure_ascii=False keeps the
        # Chinese text human-readable instead of \uXXXX escapes.
        f.write(json.dumps(c, ensure_ascii=False))
        f.write('\n')
- 图片
保存到图片的形式(.jpg/.png)因为该页面没有图片链接,我们另取一图片链接进行保存。
import urllib.request

# Download an image and save it to disk.
# NOTE(fix): the original did `import urllib` only — `urllib.request` must be
# imported explicitly or `urllib.request.urlopen` raises AttributeError. It
# also bound the response to the name `bytes` (shadowing the builtin) and
# never closed the response or the file.
img_url = 'https://img.kanzhun.com/images/logo/20150418/1a3506ade5b0bd337d8036e2d3b16351.jpg'
# Both the HTTP response and the output file are managed by `with`, so they
# are closed even on error; the file must be opened in binary ("wb") mode.
with urllib.request.urlopen(img_url) as resp, open('Ali.jpg', 'wb') as fp:
    fp.write(resp.read())