mycat2分库分表实验——以ml-latest数据集为例
注:users.csv非ml-latest数据集中的内容,是任课老师便于前端展示而随机生成的文件
由于ubuntu20.04最低支持mysql8,而mycat1与后端flask相连会因为utf8mb4的编码和mysql8废弃的方法导致无法解决的问题,遂尝试使用最新适配mysql8的mycat2。以下是本人的野人献曝。
数据处理总体思想:扔掉冗余数据
- movieId处理
提取出tag、link、movie表共同的movieId(求交集),并且把tag、link、movie表的movieId替换为新的movieId,最后扔掉存在空值的数据tag = pd.read_csv('ml-latest/genome-scores.csv').dropna() movie = pd.read_csv('ml-latest/movies.csv').dropna() link = pd.read_csv('ml-latest/links.csv').dropna() tag.movieId = tag.movieId.astype(int) movie.movieId = movie.movieId.astype(int) link.movieId = link.movieId.astype(int) movieIds = set(tag.movieId).intersection(set(movie.movieId)).intersection(set(link.movieId)) omid2nmid={mid:idx for idx,mid in enumerate(movieIds)} tag.movieId = tag.movieId.apply(lambda x:omid2nmid[x] if x in omid2nmid else np.nan) tag = tag.dropna().sort_values(by=["movieId","tagId"]).groupby('movieId').apply(lambda df:pd.Series(df['relevance'].tolist())) np.save('ml-latest/processed_tags.npy',tag.to_numpy()) toptag = np.argsort(-tag.to_numpy())[:,:3] toptagrelevance = -np.sort(-tag.to_numpy())[:,:3]
- userId处理
先将rating表中的movieId转换为新的movieId,扔掉存在空值的数据,再提取当下rating表userId和user表中userId的交集,并且把rating、user表的userId替换为新的userId,最后扔掉存在空值的数据user = pd.read_csv('ml-latest/users.csv',encoding='gbk').dropna() rating = pd.read_csv('ml-latest/ratings.csv').dropna() rating.movieId = rating.movieId.apply(lambda x:omid2nmid[x] if x in omid2nmid else np.nan) rating.userId = rating.userId.apply(lambda x:np.nan if x>user.shape[0] else x) rating.dropna(inplace=True) ouid2nuid={oid:idx for idx,oid in enumerate(rating.userId.unique())} rating.userId = rating.userId.apply(lambda x:ouid2nuid[x]) user.userId = user.userId.apply(lambda x:ouid2nuid[x] if x in ouid2nuid else np.nan) user.dropna(inplace=True) np.save('ml-latest/processed_users.npy',user.sort_values(by="userId").reset_index(drop=True).to_numpy()) np.save('ml-latest/processed_ratings.npy',rating.sort_values(by="userId").to_numpy())
- movie表处理
把movie的genre用onehot进行编码,然后把movie,genre,link,tag拼起来genres = ["Action","Adventure","Animation","Children's","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"] genres2id = {genre:i for i,genre in enumerate(genres)} np.save('ml-latest/genres2id.npy',genres2id) movie.movieId = movie.movieId.apply(lambda x:omid2nmid[x] if x in omid2nmid else np.nan) movie = movie.dropna().sort_values(by="movieId") link.movieId = link.movieId.apply(lambda x:omid2nmid[x] if x in omid2nmid else np.nan) link = link.dropna().sort_values(by="movieId") genre = np.array([''.join(['1' if genre in genre_list else '0' for genre in genres]) for genre_list in movie.genres.str.split('|').tolist()]) np.save('ml-latest/processed_movies.npy',np.concatenate([movie.to_numpy()[:,1:-1],genre.reshape(-1,1),link.to_numpy().astype(np.int64)[:,1:],toptag,toptagrelevance],axis=1))
mycat
- mycat2 主要配置文件介绍
. ├── bin #mycat2 执行文件 ├── conf #mycat2 配置文件 │ ├── clusters #集群,用来配置数据库读写分离 │ │ ├── movielens_cluster.cluster.json #movielens结点 │ │ └── prototype.cluster.json #初始结点 │ ├── datasources #数据源,用来配置分布式 │ │ ├── movielens.datasource.json #movielens数据源 │ │ ├── backup.datasource.json #备份数据源 │ │ └── prototypeDs.datasource.json #初始数据源 │ ├── schemas #数据库模式,用来配置数据库分片 │ │ ├── movielens.schema.json #movielens数据库分片配置文件 │ │ ├── mycat.schema.json #mycat数据库 │ │ └── mysql.schema.json #初始数据库 │ ├── server.json #mycat2服务器(虚拟数据库)配置文件 │ ├── sql #一些sql语句便于了解mycat2,替换为自己的sql语句 │ │ ├── create_movie.sql #创建movie分片表 │ │ ├── create_rating.sql #创建rating分片表 │ │ ├── create_user.sql #创建user分片表 │ │ ├── genre.sql #创建genre表并导入数据 │ │ ├── movie.sql #导入movie数据 │ │ ├── rating.sql #导入rating数据 │ │ ├── tag.sql #创建tag表并导入数据 │ │ └── user.sql #导入user数据 │ ├── sqlcaches │ ├── state.json #mycat2备份配置文件 │ ├── users │ │ └── root.user.json #mycat2用户配置文件 ├── lib #mycat2依赖包 └── logs #日志文件 ├── mycat.pid └── wrapper.log
- 以movielens_cluster.cluster.json为例提供读写分离和备份的配置参考
{ "clusterType":"MASTER_SLAVE", "heartbeat":{ "heartbeatTimeout":1000, "maxRetryCount":3, "minSwitchTimeInterval":300, "slaveThreshold":0.0 }, "masters":[ "movielens" ], "maxCon":2000, "name":"movielens_cluster", "readBalanceType":"BALANCE_ALL", "replicas":[ "backup" ], "switchType":"SWITCH" }
- 以movielens.datasource.json为例提供(伪)分布式的数据源配置参考
{ "dbType":"mysql", "idleTimeout":60000, "initSqls":[], "initSqlsGetConnection":true, "instanceType":"READ_WRITE", "maxCon":1000, "maxConnectTimeout":3000, "maxRetryCount":5, "minCon":1, "name":"movielens", "password":"数据库密码", "queryTimeout":30, "type":"JDBC", "url":"jdbc:mysql://127.0.0.1:3306/movielens?serverTimezone=UTC&useUnicode=true&characterEncoding=UTF-8&autoReconnect=true", "user":"root", "weight":0 }
- 以movielens.schema.json为例提供数据库水平分片的参考配置方法和配置文件
注:直接修改配置文件会导致在mycat2虚拟数据库上看不到对应的表,因此需要通过mycat2推荐的通过注释构建的方式来实现-
create_user.sql
/*+ mycat:createTable{ "schemaName":"movielens", "shardingTable":{ "createTableSQL":"CREATE TABLE movielens.user (\n\tuserid INTEGER NOT NULL,\n\tgender VARCHAR(10),\n\tname VARCHAR(255) NOT NULL,\n\tpassword VARCHAR(200) NOT NULL,\n\tcreated_on DATETIME,\n\tlast_login DATETIME,\n\tPRIMARY KEY (userid)\n)", "partition":{ "data":[["movielens_cluster","movielens_1","user_1","0","0","0"],["movielens_cluster","movielens_2","user_2","1","1","1"],["movielens_cluster","movielens_3","user_3","2","2","2"]] }, "function":{ "clazz":"io.mycat.router.mycat1xfunction.PartitionByMod", "properties":{ "count":"3", "columnName":"userid" }, } }, "tableName":"user" } */;
-
create_movie.sql
/*+ mycat:createTable{ "schemaName":"movielens", "shardingTable":{ "createTableSQL":"CREATE TABLE movielens.`movie` (\n\t`movieid` int NOT NULL,\n\t`title` varchar(255) NOT NULL,\n\t`genre` varchar(20) NOT NULL,\n\t`src` varchar(255) DEFAULT NULL,\n\t`imdb` varchar(20) NOT NULL,\n\t`tmdb` varchar(20) NOT NULL,\n\t`tagtop1` int NOT NULL,\n\t`tagtop2` int NOT NULL,\n\t`tagtop3` int NOT NULL,\n\t`relevance1` float NOT NULL,\n\t`relevance2` float NOT NULL,\n\t`relevance3` float NOT NULL,\n\tPRIMARY KEY (`movieid`)\n)", "partition":{ "data":[["movielens_cluster","movielens_1","movie_1","0","0","0"],["movielens_cluster","movielens_2","movie_2","1","1","1"],["movielens_cluster","movielens_3","movie_3","2","2","2"]] }, "function":{ "clazz":"io.mycat.router.mycat1xfunction.PartitionByMod", "properties":{ "count":"3", "columnName":"movieid" }, } }, "tableName":"movie" } */;
-
create_rating.sql
/*+ mycat:createTable{ "schemaName":"movielens", "shardingTable":{ "createTableSQL":"CREATE TABLE movielens.rating (\n\tratingid INTEGER NOT NULL,\n\tuserid INTEGER NOT NULL,\n\tmovieid INTEGER NOT NULL,\n\trating FLOAT NOT NULL,\n\tcreated_on DATETIME,\n\tPRIMARY KEY (ratingid)\n)", "partition":{ "data":[["movielens_cluster","movielens_1","rating_1","0","0","0"],["movielens_cluster","movielens_2","rating_2","1","1","1"],["movielens_cluster","movielens_3","rating_3","2","2","2"]] }, "function":{ "clazz":"io.mycat.router.mycat1xfunction.PartitionByMod", "properties":{ "count":"3", "columnName":"userid" }, } }, "tableName":"rating" } */;
-
执行sql
mysql -uroot -p -Dmovielens -h127.0.0.1 -P8066 mysql>source create_user.sql; mysql>source create_movie.sql; mysql>source create_rating.sql; mysql>exit;
-
最终形成的配置文件
{ "customTables":{}, "globalTables":{}, "normalTables":{ "genre":{ "createTableSQL":"CREATE TABLE movielens.`genre` (\n\t`genreid` int NOT NULL AUTO_INCREMENT,\n\t`genre` varchar(50) NOT NULL,\n\tPRIMARY KEY (`genreid`)\n)", "locality":{ "schemaName":"movielens", "tableName":"genre", "targetName":"prototype" } }, "tag":{ "createTableSQL":"CREATE TABLE movielens.`tag` (\n\t`tagid` int NOT NULL AUTO_INCREMENT,\n\t`tag` varchar(100) DEFAULT NULL,\n\tPRIMARY KEY (`tagid`)\n)", "locality":{ "schemaName":"movielens", "tableName":"tag", "targetName":"prototype" } } }, "schemaName":"movielens", "shardingTables":{ "movie":{ "createTableSQL":"CREATE TABLE movielens.`movie` (\n\t`movieid` int NOT NULL,\n\t`title` varchar(255) NOT NULL,\n\t`genre` varchar(20) NOT NULL,\n\t`src` varchar(255) DEFAULT NULL,\n\t`imdb` varchar(20) NOT NULL,\n\t`tmdb` varchar(20) NOT NULL,\n\t`tagtop1` int NOT NULL,\n\t`tagtop2` int NOT NULL,\n\t`tagtop3` int NOT NULL,\n\t`relevance1` float NOT NULL,\n\t`relevance2` float NOT NULL,\n\t`relevance3` float NOT NULL,\n\tPRIMARY KEY (`movieid`)\n)", "function":{ "clazz":"io.mycat.router.mycat1xfunction.PartitionByMod", "properties":{ "count":"3", "columnName":"movieid" }, "ranges":{} }, "partition":{ "data":[ [ "movielens_cluster", "movielens_1", "movie_1", "0", "0", "0" ], [ "movielens_cluster", "movielens_2", "movie_2", "1", "1", "1" ], [ "movielens_cluster", "movielens_3", "movie_3", "2", "2", "2" ] ] }, "shardingIndexTables":{} }, "rating":{ "createTableSQL":"CREATE TABLE movielens.rating (\n\tratingid INTEGER NOT NULL,\n\tuserid INTEGER NOT NULL,\n\tmovieid INTEGER NOT NULL,\n\trating FLOAT NOT NULL,\n\tcreated_on DATETIME,\n\tPRIMARY KEY (ratingid)\n)", "function":{ "clazz":"io.mycat.router.mycat1xfunction.PartitionByMod", "properties":{ "count":"3", "columnName":"userid" }, "ranges":{} }, "partition":{ "data":[ [ "movielens_cluster", "movielens_1", "rating_1", "0", "0", "0" ], [ "movielens_cluster", "movielens_2", "rating_2", "1", "1", "1" ], [ "movielens_cluster", "movielens_3", "rating_3", "2", "2", "2" ] ] }, "shardingIndexTables":{} }, "user":{ "createTableSQL":"CREATE TABLE movielens.user (\n\tuserid INTEGER NOT NULL,\n\tgender VARCHAR(10),\n\tname VARCHAR(255) NOT NULL,\n\tpassword VARCHAR(200) NOT NULL,\n\tcreated_on DATETIME,\n\tlast_login DATETIME,\n\tPRIMARY KEY (userid)\n)", "function":{ "clazz":"io.mycat.router.mycat1xfunction.PartitionByMod", "properties":{ "count":"3", "columnName":"userid" }, "ranges":{} }, "partition":{ "data":[ [ "movielens_cluster", "movielens_1", "user_1", "0", "0", "0" ], [ "movielens_cluster", "movielens_2", "user_2", "1", "1", "1" ], [ "movielens_cluster", "movielens_3", "user_3", "2", "2", "2" ] ] }, "shardingIndexTables":{} } }, "views":{} }
-