Working with CSV files in Python web crawling

I. Syntax
1. Reading: reader = csv.reader(csvFile)  # returns an iterator over rows
2. Writing: writer = csv.writer(csvFile2)
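As a quick illustration of both calls, here is a minimal round-trip sketch; the file name demo.csv is just a placeholder for this example:

import csv

# Write a couple of rows, then read them back.
# newline="" prevents blank lines between rows on Windows.
with open("demo.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "name"])          # one row
    writer.writerows([[1, "a"], [2, "b"]])   # several rows at once

with open("demo.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)                   # iterator over rows
    for row in reader:
        print(row)  # every field comes back as a string, e.g. ['1', 'a']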
II. Small examples
(1)

import csv
import json


def f1():
    # Write a header row plus several data rows into a new CSV file.
    # newline="" prevents extra blank lines between rows on Windows.
    with open("./files/mycsv.csv", "w", newline="", encoding="utf-8") as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(["sid", "sname", "sage"])
        csv_writer.writerows([[1, "a", 20], [2, "a", 20], [3, "a", 20]])


def f2():
    # Load a list of dicts from JSON, then write the keys as the header row
    # and the values of each dict as one data row.
    with open("./files/tecent.json", "r", encoding="utf-8") as file:
        content = json.load(file)
        keys = content[0].keys()
        values = [i.values() for i in content]

    with open("./files/tecent.csv", "w", newline="", encoding="utf-8") as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(keys)
        csv_writer.writerows(values)


def f3():
    pass


if __name__ == '__main__':
    # f1()
    f2()

When you open the resulting file (for example in Excel), choose the matching encoding, otherwise the text will appear garbled.
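One common workaround, not used in the code above, is to write the file with the utf-8-sig codec so the BOM lets spreadsheet software detect UTF-8 automatically; a small sketch (mycsv_excel.csv is a hypothetical file name) might look like this:

import csv

# utf-8-sig prepends a BOM so Excel recognizes the file as UTF-8.
with open("./files/mycsv_excel.csv", "w", newline="", encoding="utf-8-sig") as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(["sid", "sname", "sage"])
    csv_writer.writerows([[1, "张三", 20], [2, "李四", 21]])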

(2)

# coding:utf-8
import csv

# Read a CSV file, approach 1: open and close the file manually
csvFile = open("csvData.csv", "r")
reader = csv.reader(csvFile)  # returns an iterator over rows
data = []
for item in reader:
    print(item)
    data.append(item)
print(data)
csvFile.close()

# Read a CSV file, approach 2: with statement (closes the file automatically)
with open("csvData.csv", "r") as csvfile:
    reader2 = csv.reader(csvfile)  # returns an iterator over rows
    for item2 in reader2:
        print(item2)

# Write a CSV file from a list
csvFile2 = open('csvFile2.csv', 'w', newline='')  # newline='' avoids a blank line between rows
writer = csv.writer(csvFile2)
m = len(data)
for i in range(m):
    writer.writerow(data[i])
csvFile2.close()

# Write a CSV file from a dict
dic = {'张三': 123, '李四': 456, '王二娃': 789}
csvFile3 = open('csvFile3.csv', 'w', newline='')
writer2 = csv.writer(csvFile3)
for key in dic:
    writer2.writerow([key, dic[key]])
csvFile3.close()
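For dict-shaped records, the standard library also offers csv.DictWriter and csv.DictReader. This is only an illustrative sketch (csvFile4.csv and the sample rows are made up), not part of the example above:

import csv

rows = [{"name": "张三", "score": 123}, {"name": "李四", "score": 456}]

# DictWriter maps dict keys onto named columns and writes the header for you.
with open("csvFile4.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["name", "score"])
    writer.writeheader()
    writer.writerows(rows)

# DictReader yields one dict per data row, keyed by the header.
with open("csvFile4.csv", "r", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        print(row["name"], row["score"])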

III. Using it in a real crawler
Crawl Qiushibaike and save the data to a CSV file:

import requests
from lxml import etree
import csv


class QiubaiSpider:
    """Qiushibaike spider"""

    def __init__(self):
        """Initialize the url template, request headers and the header-row flag"""
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        self.flag = True  # True until the CSV header row has been written

    def get_url_list(self):
        """Build the url list; 13 pages in total"""
        return [self.url_temp.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        """Request a url and return the decoded html (empty string on failure)"""
        try:
            response = requests.get(url, headers=self.headers)
            return response.content.decode("utf-8")
        except Exception as ex:
            print(ex)
            return ""


    def get_page_content_list(self, html_str):
        """Extract the data of one page"""
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")  # one div per post
        content_list = []
        for div in div_list:
            item = {}
            item["content"] = "".join(div.xpath(".//div[@class='content']/span/text()")).replace("\n", "")  # post text
            item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")  # author gender
            item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon", "") if len(item["author_gender"]) > 0 else None
            item["author_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")  # author age
            item["author_age"] = item["author_age"][0] if len(item["author_age"]) > 0 else None
            item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")  # post image
            item["content_img"] = "https:" + item["content_img"][0] if len(item["content_img"]) > 0 else None
            item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")  # author avatar
            item["author_img"] = "https:" + item["author_img"][0] if len(item["author_img"]) > 0 else None
            item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")  # vote count
            item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
            content_list.append(item)
        return content_list

    def save_page_content_list(self, content_list):
        """Save the data; content_list is a list of dicts"""
        values = [i.values() for i in content_list]
        # newline="" avoids blank lines between rows on Windows
        with open("./files/臭事百科.csv", "a", newline="", encoding="utf-8") as file:
            csv_writer = csv.writer(file)
            if self.flag:
                # write the header row only once, on the first call
                csv_writer.writerow(content_list[0].keys())
                self.flag = False
            csv_writer.writerows(values)


    def run(self):
        """Main logic"""

        # 1. build the url list
        url_list = self.get_url_list()
        # 2. iterate, send requests, get responses
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_page_content_list(html_str)
            # 4. save
            self.save_page_content_list(content_list)
            # break


if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()
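To sanity-check the output, the saved file can be read back with csv.DictReader. This is just a verification sketch under the assumption that the spider above has already written the file (the path mirrors the one used in save_page_content_list):

import csv

# Read the CSV the spider appended to and peek at the first few records.
with open("./files/臭事百科.csv", "r", newline="", encoding="utf-8") as file:
    for i, row in enumerate(csv.DictReader(file)):
        print(row["content"], row["stats_vote"])
        if i >= 4:  # only show the first five rows
            break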
