scrapy爬虫-数据预处理pipelines与pandas

最新推荐文章于 2023-04-01 00:16:41 发布

bug智造

最新推荐文章于 2023-04-01 00:16:41 发布

阅读量477

点赞数

分类专栏： Python 文章标签： python 数据库开发语言

本文链接：https://blog.csdn.net/weixin_45971950/article/details/122403199

版权

Python 专栏收录该内容

21 篇文章 12 订阅

订阅专栏

settings.py进行注册开启ConnectMysqlPipeline

ITEM_PIPELINES = {
   'caifu_gp.pipelines.CaifuGpPipeline': 300,
   'caifu_gp.pipelines.ConnectMysqlPipeline': 300,
}

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface

from itemadapter import ItemAdapter
import pymysql

# price,fluctuation,rise,turnover,amount,volume,rate
class CaifuGpPipeline:
    def process_item(self, item, spider):
        if item['price'] == '-':
            item['price'] = ''
            item['fluctuation'] = ''
            item['rise'] = ''
            item['turnover'] = ''
            item['amount'] = ''
            item['volume'] = ''
            item['rate'] = ''
        return item

class ConnectMysqlPipeline:
    # 连接数据库
    def __init__(self):
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            port=3306,
            db='test',
            charset='utf8',
            )
        self.cursor = self.db.cursor()
        # code varchar(255) int not null primary key,设置主键也可以避免重复数据
        sql = 'create table if not exists gp(code varchar(255),name varchar(100),market varchar(100),price double,fluctuation double,rise double,turnover int(255),amount double(255,1),volume double,rate double)'
        self.cursor.execute(sql)
    def process_item(self, item, spider):
        insert_sql = """insert into gp(code,name,market,price,fluctuation,rise,turnover,amount,volume,rate)
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        select_sql = """select * from gp where code=%s"""
        # 插入数据
        try:
            # 查重
            self.cursor.execute(select_sql, item['code'])
            result = self.cursor.fetchall()
            if result:
                print("数据重复")
            else:
                self.cursor.execute(insert_sql,
                (
                item['code'],
                item['name'],
                item['market'],
                item['price'],
                item['fluctuation'],
                item['rise'],
                item['turnover'],
                item['amount'],
                item['volume'],
                item['rate'],
                ))
        except Exception as e:
            print(e)
            self.db.rollback()
            self.cursor.execute("""insert into gp(code,name,market)
                values(%s, %s, %s)""", (
                item['code'],
                item['name'],
                item['market']))
        # 提交sql语句
        self.db.commit()
        # 删除空值
        sql = "delete from gp where price is null"
        self.cursor.execute(sql)
        return item
    # 关闭数据库连接
    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

MySQL删除空值

delete from table where username is null

pandas数据预处理

import pandas as pd

#  将不符合的数值转为NaN
na_value = ['0']
df = pd.read_csv('data.csv', na_values=na_value,  converters={'code': str})
print(df['price'])
## 删除包含空数据的行
# df1 = df.dropna()
## inplace = True修改源数据       df1 = df.dropna(inplace = True)
# print(df1.to_string())
## subset=['列名字段']移除指定列有空值的行
# df2 = df.dropna(subset=['price'])
# print(df2.to_string())
##  fillna() 方法来替换一些空字段
# df3 = df.fillna(123)
# print(df3.to_string())
## 指定某一个列来替换数据：
# df3 = df['price'].fillna(123)
# print(df3.to_string())
## 使用 mean() 方法计算列的均值并替换空单元格
## 使用 mean()、median() 和 mode() 方法计算列的均值（所有值加起来的平均值）、中位数值（排序后排在中间的数）和众数（出现频率最高的数）
# x = df['price'].mean()
# df4 = df['price'].fillna(x)
# print(df4.to_string())
## 修改错误数据df.loc
# df5 = df.loc[439, 'price'] = 2
# print(df)
## 批量修改错误数据
# for x in df.index:
#     if df.loc[x, 'price'] > 10:
#         df6 = df.loc[x, 'price'] = 2
# print(df.to_string())
##删除错误数据
# for x in df.index:
#     if df.loc[x, 'price'] > 2:
#         df7 = df.drop(x)
# print(df.to_string())
## 删除重复数据，可以直接使用drop_duplicates() 方法。
# df8 = df.drop_duplicates()
# print(df8.to_string())

pandas读取文件读取开头有0缺失的问题

df = pd.read_csv('data.csv', na_values=na_value, dtype=object)
# 或df = pd.read_csv('data.csv', na_values=na_value, dtype=object, converters={'code': str})  
# dtype=object以字符串形式读取或 converters={'字段名': str}转为字符串形式