import csv
import os
import re
import pandas as pd
import requests
import random
import time
import json
from pymysql import *
from lxml import etree
from sqlalchemy import create_engine
from datetime import datetime, timedelta
engine = create_engine('mysql+pymysql://root:root@localhost:3306/doubanmovie')
class spider(object):
def init(self):
try:
conn = connect(host='localhost', user='root', password='root', database='doubanmovie', port=3306, charset='utf8mb4')
sql = '''
create table movies(
id int primary key auto_increment,
directors varchar(2555),
rate varchar(255),
title varchar(255),
casts varchar(255),
cover varchar(255),
year varchar(255),
types varchar(255),
country varchar(255),
lang varchar(255),
time varchar(255),
movieTime varchar(255),
comment_len varchar(255),
starts varchar(255),
summary varchar(2555),
comments text,
imgList varchar(2555),
movieUrl varchar(255),
detailLink varchar(255)
)engine = innodb DEFAULT CHARSET=utf8
'''
cursor = conn.cursor()
cursor.execute(sql)
conn.commit()
except:
pass
def clear_csv(self):
# df = pd.read_csv('某条movies_已清洗.csv')
# 读取CSV文件
df = pd.read_csv('movies.csv')
# 将详情页链接字段移到最后一列
df = df[[col for col in df.columns if col != 'detailLink'] + ['detailLink']]
# 1. 加载CSV文件到DataFrame中
# df = pd.read_csv('movies.csv')
# 2. 使用正则表达式筛选出包含汉字和括号的行
pattern = re.compile(r'[\u4e00-\u9fa5\(\)]')
mask = df['time'].str.contains(pattern)
# 3. 删除筛选出的行数据
df = df[~mask]
# 删除只有年份的数据
df = df[df['time'].str.len() > 4]
# 将时间格式统一改为%Y-%m-%d形式
df['time'] = pd.to_datetime(df['time'], errors='coerce').dt.strftime('%Y-%m-%d')
# # 4. 将处理后的DataFrame保存回CSV文件中
# df.to_csv('movies_cleaned.csv', index=False)
# df.to_csv('movies_modified.csv', index=False)
df.dropna(inplace=True) #删除缺失值
# df.fillna(method='ffill', inplace=True) #填充缺失值 两种都可以
# df.ffill(inplace=True)
df.drop_duplicates() # 删除重复值
df.to_csv('movies_cleaned3.csv', mode='w', index=False)
# 读取CSV文件
# df = pd.read_csv('某条movies.csv')
# df = pd.read_csv('movies_cleaned.csv')
# print(df.shape)
self.save_to_sql(df)
def save_to_sql(self,df):
# df = pd.read_csv('movies.csv')
# df.to_sql('movies_cleaned4', con=engine, index=False, if_exists='replace')
if __name__ == '__main__':
spiderObj = spider()
spiderObj.init()
# spiderObj.get_page()
# spiderObj.spiderMain()
spiderObj.clear_csv()
# spiderObj.save_to_sql()
movies.csv表中没有id列,但是我要求数据库中第一列必须是id列,使用上述代码,总是没有id列怎么修改呢
修改如下
在创建DataFrame时,您可以手动添加一个id列,然后将其作为第一列。您可以使用`insert`方法来插入该列,然后将其设置为自增的主键。以下是您可以进行修改的部分:
```python
class spider(object):
def init(self):
try:
conn = connect(host='localhost', user='root', password='root', database='doubanmovie', port=3306, charset='utf8mb4')
sql = '''
create table movies(
id int primary key auto_increment,
directors varchar(2555),
rate varchar(255),
title varchar(255),
casts varchar(255),
cover varchar(255),
year varchar(255),
types varchar(255),
country varchar(255),
lang varchar(255),
time varchar(255),
movieTime varchar(255),
comment_len varchar(255),
starts varchar(255),
summary varchar(2555),
comments text,
imgList varchar(2555),
movieUrl varchar(255),
detailLink varchar(255)
)engine = innodb DEFAULT CHARSET=utf8
'''
cursor = conn.cursor()
cursor.execute(sql)
conn.commit()
except Exception as e:
print("An error occurred while creating table:", e)
def clear_csv(self):
try:
# 读取CSV文件
df = pd.read_csv('movies.csv')
# 添加id列并设置为自增主键
df.insert(0, 'id', range(1, len(df) + 1))
# 将详情页链接字段移到最后一列
df = df[[col for col in df.columns if col != 'detailLink'] + ['detailLink']]
# 1. 加载CSV文件到DataFrame中
# df = pd.read_csv('movies.csv')
# 2. 使用正则表达式筛选出包含汉字和括号的行
pattern = re.compile(r'[\u4e00-\u9fa5\(\)]')
mask = df['time'].str.contains(pattern)
# 3. 删除筛选出的行数据
df = df[~mask]
# 删除只有年份的数据
df = df[df['time'].str.len() > 4]
# 将时间格式统一改为%Y-%m-%d形式
df['time'] = pd.to_datetime(df['time'], errors='coerce').dt.strftime('%Y-%m-%d')
# # 4. 将处理后的DataFrame保存回CSV文件中
# df.to_csv('movies_cleaned.csv', index=False)
# df.to_csv('movies_modified.csv', index=False)
df.dropna(inplace=True) #删除缺失值
# df.fillna(method='ffill', inplace=True) #填充缺失值 两种都可以
# df.ffill(inplace=True)
df.drop_duplicates(inplace=True) # 删除重复值
df.to_csv('movies_cleaned3.csv', mode='w', index=False)
# 读取CSV文件
# df = pd.read_csv('某条movies.csv')
# df = pd.read_csv('movies_cleaned.csv')
# print(df.shape)
self.save_to_sql(df)
except Exception as e:
print("An error occurred while cleaning CSV and saving to SQL:", e)
def save_to_sql(self, df):
try:
df.to_sql('movies', con=engine, index=False, if_exists='replace')
except Exception as e:
print("An error occurred while saving DataFrame to SQL:", e)
if __name__ == '__main__':
spiderObj = spider()
spiderObj.init()
# spiderObj.get_page()
# spiderObj.spiderMain()
spiderObj.clear_csv()
# spiderObj.save_to_sql()
```
这样就会在DataFrame中添加一个自增的id列,并将其作为第一列,然后将DataFrame保存到数据库时,该id列将成为数据库表中的第一列。