scrapy爬虫-数据预处理pipelines与pandas

settings.py进行注册开启ConnectMysqlPipeline

ITEM_PIPELINES = {
   'caifu_gp.pipelines.CaifuGpPipeline': 300,
   'caifu_gp.pipelines.ConnectMysqlPipeline': 300,
}

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
​
​
# useful for handling different item types with a single interface
​
from itemadapter import ItemAdapter
import pymysql
​
# price,fluctuation,rise,turnover,amount,volume,rate
class CaifuGpPipeline:
    def process_item(self, item, spider):
        if item['price'] == '-':
            item['price'] = ''
            item['fluctuation'] = ''
            item['rise'] = ''
            item['turnover'] = ''
            item['amount'] = ''
            item['volume'] = ''
            item['rate'] = ''
        return item
​
class ConnectMysqlPipeline:
    # 连接数据库
    def __init__(self):
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            port=3306,
            db='test',
            charset='utf8',
            )
        self.cursor = self.db.cursor()
        # code varchar(255) int not null primary key,设置主键也可以避免重复数据
        sql = 'create table if not exists gp(code varchar(255),name varchar(100),market varchar(100),price double,fluctuation double,rise double,turnover int(255),amount double(255,1),volume double,rate double)'
        self.cursor.execute(sql)
    def process_item(self, item, spider):
        insert_sql = """insert into gp(code,name,market,price,fluctuation,rise,turnover,amount,volume,rate)
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        select_sql = """select * from gp where code=%s"""
        # 插入数据
        try:
            # 查重
            self.cursor.execute(select_sql, item['code'])
            result = self.cursor.fetchall()
            if result:
                print("数据重复")
            else:
                self.cursor.execute(insert_sql,
                (
                item['code'],
                item['name'],
                item['market'],
                item['price'],
                item['fluctuation'],
                item['rise'],
                item['turnover'],
                item['amount'],
                item['volume'],
                item['rate'],
                ))
        except Exception as e:
            print(e)
            self.db.rollback()
            self.cursor.execute("""insert into gp(code,name,market)
                values(%s, %s, %s)""", (
                item['code'],
                item['name'],
                item['market']))
        # 提交sql语句
        self.db.commit()
        # 删除空值
        sql = "delete from gp where price is null"
        self.cursor.execute(sql)
        return item
    # 关闭数据库连接
    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
​

MySQL删除空值

delete from table where username is null

pandas数据预处理

import pandas as pd
​
#  将不符合的数值转为NaN
na_value = ['0']
df = pd.read_csv('data.csv', na_values=na_value,  converters={'code': str})
print(df['price'])
## 删除包含空数据的行
# df1 = df.dropna()
## inplace = True修改源数据       df1 = df.dropna(inplace = True)
# print(df1.to_string())
## subset=['列名字段']移除指定列有空值的行
# df2 = df.dropna(subset=['price'])
# print(df2.to_string())
##  fillna() 方法来替换一些空字段
# df3 = df.fillna(123)
# print(df3.to_string())
## 指定某一个列来替换数据:
# df3 = df['price'].fillna(123)
# print(df3.to_string())
## 使用 mean() 方法计算列的均值并替换空单元格
## 使用 mean()、median() 和 mode() 方法计算列的均值(所有值加起来的平均值)、中位数值(排序后排在中间的数)和众数(出现频率最高的数)
# x = df['price'].mean()
# df4 = df['price'].fillna(x)
# print(df4.to_string())
## 修改错误数据df.loc
# df5 = df.loc[439, 'price'] = 2
# print(df)
## 批量修改错误数据
# for x in df.index:
#     if df.loc[x, 'price'] > 10:
#         df6 = df.loc[x, 'price'] = 2
# print(df.to_string())
##删除错误数据
# for x in df.index:
#     if df.loc[x, 'price'] > 2:
#         df7 = df.drop(x)
# print(df.to_string())
## 删除重复数据,可以直接使用drop_duplicates() 方法。
# df8 = df.drop_duplicates()
# print(df8.to_string())

pandas读取文件读取开头有0缺失的问题

df = pd.read_csv('data.csv', na_values=na_value, dtype=object)
# 或df = pd.read_csv('data.csv', na_values=na_value, dtype=object, converters={'code': str})  
# dtype=object以字符串形式读取或 converters={'字段名': str}转为字符串形式

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

bug智造

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值