python 存储_Python爬取之数据存储

3b067f4d7981fa5b805f0a6ccc30b383.png

在大家学习Python爬虫的过程中,必然少不了将爬取的数据进行存储,现在最常用的存储方式有TXT文本、CSV文件、Excel文件、数据库这四种,今天我们就来一一介绍一下如何存储。

1、TXT文本存储

f=open(path+'filename.txt','a+')

f.write(info+'\n')

f.close()

《斗破苍穹》文本存储代码如下:

import requests
import re
import time

# Request headers: replace the placeholder with a real browser
# User-Agent string, otherwise many sites reject the request.
headers={
"User-Agent": "请求头"
}

# Append mode so repeated runs keep accumulating chapters.
f=open('doupo.txt','a+')

def get_info(url):
    """Fetch one chapter page and append every paragraph to doupo.txt.

    url: chapter URL on doupoxs.com. Pages that do not answer with
    HTTP 200 are silently skipped.
    """
    res=requests.get(url,headers=headers)
    if res.status_code==200:
        # BUGFIX: the closing tag was written as '<p>' (an opening tag)
        # in the original, which drags the '</p>' into each match.
        contents = re.findall('<p>(.*?)</p>',res.content.decode('utf-8'),re.S)
        for content in contents:
            # BUGFIX: the original wrote the literal character 'n'
            # instead of a newline '\n'.
            f.write(content+'\n')
            print(content)

if __name__=='__main__':
    urls=['http://www.doupoxs.com/doupocangqiong/{}.html'.format(i) for i in range(2,4)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # throttle requests to be polite to the server
    f.close()

运行结果如下:

67b1abce48c76840fb3eccaa781bea1a.png

2、CSV文件存储

import csv

fp=open(path+'/filename.csv','wt',newline='',encoding='utf-8')

writer=csv.writer(fp)

writer.writerow(('name','url'))

fp.close()

《豆瓣网图书TOP1》CSV存储代码如下:

from lxml import etree
import requests
import csv

# BUGFIX: use 'utf-8-sig' so Excel sees the BOM and decodes the file
# correctly — plain 'utf-8' is the cause of the mojibake shown below.
fp=open('doubanbooktop.csv','wt',newline='',encoding='utf-8-sig')

writer=csv.writer(fp)
writer.writerow(('name','book_infos'))
# range(0, 25, 25) yields only start=0, i.e. just the first page.
urls=['https://book.douban.com/top250?start={}'.format(str(i)) for i in range(0,25,25)]

# Replace the placeholder with a real browser User-Agent string.
headers={
"User-Agent": "请求头"
}

for url in urls:
    html = requests.get(url,headers=headers)
    selector = etree.HTML(html.text)
    # One <tr class="item"> per book on the listing page.
    infos=selector.xpath('//tr[@class="item"]')

    for info in infos:
        name=info.xpath('td/div[1]/a/text()')[0]
        book_infos=info.xpath('td/p/text()')[0]
        print(name,book_infos)
        # BUGFIX: writerow((name)) is writerow(name) — a bare string —
        # so every character became its own column. Write a real
        # 2-tuple that matches the header row.
        writer.writerow((name,book_infos))

# BUGFIX: close the file once, AFTER the loop — the original closed it
# inside the url loop, so any second page would hit a closed file.
fp.close()

运行结果如下:

(1)乱码格式的csv

8ca084cbc4260490f2cfd427267eb875.png

(2)转换文本存储后

a45f85fb18310edd39533faf0d89c823.png

3、Excel文件存储

import xlwt

book=xlwt.Workbook(encoding='utf-8')

sheet=book.add_sheet('sheet1')

sheet.write(0,0,'python')

sheet.write(1,1,'love')

book.save('text.xls')

《起点中文网》Excel存储代码如下:

import xlwt
from lxml import etree
import requests
import time

# Replace the placeholder with a real browser User-Agent string.
headers={
"User-Agent": "请求头"
}

all_info_list = []  # one [title, author, style, complete, introduce, word] row per novel

def get_info(url):
    """Scrape one listing page of qidian.com and append each novel's
    details to all_info_list."""
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    # One <li> per novel — locate the outer tag, then drill into it.
    infos=selector.xpath('//ul[@class="all-img-list cf"]/li')
    for info in infos:
        title=info.xpath('div[2]/h4/a/text()')[0]
        author=info.xpath('div[2]/p[1]/a[1]/text()')[0]
        style_1 = info.xpath('div[2]/p[1]/a[2]/text()')[0]
        style_2 = info.xpath('div[2]/p[1]/a[3]/text()')[0]
        style = style_1+'·'+ style_2
        complete = info.xpath('div[2]/p[1]/span/text()')[0]
        introduce = info.xpath('div[2]/p[2]/text()')[0].strip()
        word = info.xpath('div[2]/p[3]/span/text()')[0].strip('万字')
        print(title, author, style, complete, introduce, word)
        all_info_list.append([title, author, style, complete, introduce, word])
    time.sleep(1)  # throttle requests to be polite to the server

if __name__ == '__main__':
    urls = ['http://a.qidian.com/?page={}'.format(str(i)) for i in range(1, 2)]
    for url in urls:
        get_info(url)

    # BUGFIX: build and save the workbook ONCE, after all pages are
    # scraped — the original recreated the workbook inside the url loop
    # and called book.save() once per data row.
    header = ['title', 'author', 'style', 'complete', 'introduce', 'word']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('sheet1')
    for col, col_name in enumerate(header):
        sheet.write(0, col, col_name)
    # Data rows start at row 1, right below the header.
    for row, info_list in enumerate(all_info_list, start=1):
        for col, data in enumerate(info_list):
            sheet.write(row, col, data)
    book.save('qidianxiaoshuo.xls')  # 保存文件

运行结果如下:

87805b3ebacfcdce9e0ee8870fa1db2f.png

4、数据库存储

import pymysql

conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='pymysql_name',port=3306,charset='utf8')

cursor=conn.cursor()

cursor.execute('insert into table(name,info1)values(%s,%s)',(name,info1))

conn.commit()

《豆瓣电影TOP》数据库存储代码如下

import requests
from lxml import etree
import re
import pymysql
import time

# BUGFIX: the original connect() line was a syntax error — it had an
# unterminated string literal ('库名称, port=3306...) and a bare,
# unquoted 密码 identifier. Replace the two placeholder strings with
# your real password and database name before running.
db = pymysql.connect(host='localhost', user='root', passwd='密码',
                     db='库名称', port=3306, charset='utf8')
cursor = db.cursor()

# Replace the placeholder with a real browser User-Agent string.
headers={"User-Agent": "请求头"}

def get_movie_url(url):
    """Fetch one TOP250 listing page and scrape every movie linked on it."""
    html=requests.get(url,headers=headers)
    selector=etree.HTML(html.text)
    movie_infos=selector.xpath('//div[@class="hd"]/a/@href')
    for movie_url in movie_infos:
        get_movie_info(movie_url)

def get_movie_info(url):
    """Scrape name and director from one movie detail page and insert
    the pair into the doubanmovie table (committed later by main)."""
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    try:
        name=selector.xpath('//*[@id="content"]/h1/span[1]/text()')[0]
        director=selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
        # Parameterized query — values are bound by the driver, never
        # concatenated into the SQL string.
        cursor.execute("insert into doubanmovie (name,director) values(%s,%s)",(str(name),str(director)))
    except IndexError:
        # Element missing on this page (layout variant) — skip the movie.
        pass

if __name__=='__main__':
    urls=['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0,25,25)]
    for url in urls:
        get_movie_url(url)
        time.sleep(2)  # throttle requests to be polite to the server
    db.commit()  # one commit for the whole batch of inserts

运行结果如下:

d98c6464ba5693edb45a9f9346c8f338.png

以上就是python爬取后数据存储的各种方法,大家可以根据学习过程中的需要随时切换使用来不断完善技能。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值