MySQL学习笔记_duplicates:0-CSDN博客

本文链接：https://blog.csdn.net/qq_34531825/article/details/52468007

MySQL在线帮助文档：http://dev.mysql.com/doc/refman/5.7/en/
MySQL完整中文教程：http://www.runoob.com/mysql/mysql-tutorial.html

MySQL数据类型

完整数据类型请参考：http://www.runoob.com/mysql/mysql-data-types.html
数值型

Name	字节	有符号	无符号	用途
TINYINT	1 字节	(-128，127)	(0，255)	小整数值
INT或INTEGER	4 字节	(-2 147 483 648，2 147 483 647)	(0，4 294 967 295)	大整数值
BIGINT	8 字节	(-9 223 372 036 854 775 808，9 223 372 036 854 775 807)	(0，18 446 744 073 709 551 615)	极大整数值
FLOAT	4 字节	(-3.402 823 466 E+38，-1.175 494 351 E-38)，0，(1.175 494 351 E-38，3.402 823 466 E+38)	0，(1.175 494 351 E-38，3.402 823 466 E+38)	单精度浮点数值
DOUBLE	8 字节	(-1.797 693 134 862 315 7 E+308，-2.225 073 858 507 201 4 E-308)，0，(2.225 073 858 507 201 4 E-308，1.797 693 134 862 315 7 E+308)	0，(2.225 073 858 507 201 4 E-308，1.797 693 134 862 315 7 E+308)	双精度浮点数值

字符串

Name	范围	说明
CHAR	0-255字节	定长字符串
TEXT	0-65 535字节	长文本数据
LONGTEXT	0-4 294 967 295字节	极大文本数据

从文件中导入表

更完整的信息请参考http://www.jb51.net/article/58093.htm

LOAD DATA [LOW_PRIORITY | CONCURRENT] [LOCAL] INFILE 'file_name.txt'
    [REPLACE | IGNORE] #使用IGNORE对于冲突的数据丢弃掉。
    INTO TABLE tbl_name #会保留原有所有记录，不管重复与否
    [FIELDS
        [TERMINATED BY 'string']
        [[OPTIONALLY] ENCLOSED BY 'char']
        [ESCAPED BY 'char' ]
    ]
    [LINES
        [STARTING BY 'string']
        [TERMINATED BY 'string']
    ]
    [IGNORE number LINES]
    [(col_name_or_user_var,...)]
    [SET col_name = expr,...]

mysql> create table wiki2(id BIGINT,name Char(255));

mysql> LOAD DATA LOCAL INFILE "E:/data/graphx-wiki-vertices.txt" 
    -> IGNORE #使用IGNORE对于冲突的数据丢弃掉。
    -> into table wiki2 #会保留原有所有记录，不管重复与否
    -> fields terminated by " "
    -> LINES terminated by "\n";

复制表

create table select 会将原表中的数据完整复制一份，但表结构中的索引会丢失。
create table like 只会完整复制原表的建表语句，但不会复制数据

mysql> CREATE table LargeAreaCountry SELECT * from myselect;

mysql> CREATE table LargeAreaCountry2 LIKE myselect;

建立索引

主键：主键是唯一的。一个数据表中只能包含一个主键。你可以使用主键来查询数据。外键：外键用于关联两个表。复合键：复合键（组合键）将多个列作为一个索引键，一般用于复合索引。索引：使用索引可快速访问数据库表中的特定信息。索引是对数据库表中一列或多列的值进行排序的一种结构。类似于书籍的目录。
查看表中已经存在 index：show index from table_name;
索引的创建可以在CREATE TABLE语句中进行，也可以单独用CREATE INDEX或ALTER TABLE来给表增加索引。删除索引可以利用ALTER TABLE或DROP INDEX语句来实现。
（1）使用ALTER TABLE语句创建索引。
alter table table_name add index index_name (column_list) ;
alter table table_name add unique (column_list) ;
alter table table_name add primary key (column_list) ;
其中包括普通索引、UNIQUE索引和PRIMARY KEY索引3种创建索引的格式，table_name是要增加索引的表名，column_list指出对哪些列进行索引，多列时各列之间用逗号分隔。索引名index_name可选，缺省时，MySQL将根据第一个索引列赋一个名称。另外，ALTER TABLE允许在单个语句中对同一张表进行多项更改（各项之间用逗号分隔），因此可以在一条语句中同时创建多个索引。

mysql> alter table tpsc add index shili (tpmc ) ;

（2）使用CREATE INDEX语句对表增加索引。
能够增加普通索引和UNIQUE索引两种。其格式如下：
create index index_name on table_name (column_list) ;
create unique index index_name on table_name (column_list) ;

（3）删除索引。
删除索引可以使用ALTER TABLE或DROP INDEX语句来实现。DROP INDEX可以在ALTER TABLE内部作为一条语句处理，其格式如下：
drop index index_name on table_name ;
alter table table_name drop index index_name ;
alter table table_name drop primary key ;

SELECT


#查询所有记录条数COUNT
mysql> select count(*) from students;

#查询数据表中有多少条内容不重复记录；DISTINCT
mysql> select count(distinct(name)) from students;

#也可以进行一些计算；
mysql> select count(*)/count(distinct(name)) from students;

#限制查询结果中数据列的个数
mysql> select id,name from students;

#限制查询结果中数据记录的个数
mysql> select id,name from students limit 2;

Join

原始表

students表：
+------+--------+-----+
| id   | name   | age |
+------+--------+-----+
| 2510 | wangli |  19 |
| 2514 | legity |  45 |
| 2521 | lesssa |  21 |
| 2530 | Jane   |  50 |
| 2531 | Kitty  |  30 |
| 2535 | 李伟   |  28 |
| 3300 | Sam    |  30 |
+------+--------+-----+

class 表：
+-------+--------+
| class | name   |
+-------+--------+
| 23001 | Jane   |
| 23001 | Kitty  |
| 23001 | 李伟   |
| 23002 | Sam    |
| 23002 | lesss2 |
| 23002 | lessdd |
+-------+--------+

左连接，以左表为准

select * from students left join class on students.name=class.name;

+------+--------+-----+-------+--------+
| id   | name   | age | class | name   |
+------+--------+-----+-------+--------+
| 2530 | Jane   |  50 | 23001 | Jane   |
| 2531 | Kitty  |  30 | 23001 | Kitty  |
| 2535 | 李伟   |  28 | 23001 | 李伟    |
| 3300 | Sam    |  30 | 23002 | Sam    |
| 2510 | wangli |  19 |  NULL | NULL   |
| 2514 | legity |  45 |  NULL | NULL   |
| 2521 | lesssa |  21 |  NULL | NULL   |
+------+--------+-----+-------+--------+

右连接，以右表为准

select * from students  right join class on students.name=class.name;

+------+--------+------+-------+--------+
| id   | name   | age  | class | name   |
+------+--------+------+-------+--------+
| 2530 | Jane   |   50 | 23001 | Jane   |
| 2531 | Kitty  |   30 | 23001 | Kitty  |
| 2535 | 李伟   |   28 | 23001 | 李伟    |
| 3300 | Sam    |   30 | 23002 | Sam    |
| NULL | NULL   | NULL | 23002 | lesss2 |
| NULL | NULL   | NULL | 23002 | lessdd |
+------+--------+------+-------+--------+

默认为内连接（inner join），取两表的交集（两表中同时存在匹配记录时才显示）

select * from students join class on students.name=class.name;
+------+--------+-----+-------+--------+
| id   | name   | age | class | name   |
+------+--------+-----+-------+--------+
| 2530 | Jane   |  50 | 23001 | Jane   |
| 2531 | Kitty  |  30 | 23001 | Kitty  |
| 2535 | 李伟   |  28 | 23001 | 李伟    |
| 3300 | Sam    |  30 | 23002 | Sam    |
+------+--------+-----+-------+--------+

删除表

mysql> drop table myselect; #删除整个表

mysql> delete from temp_wiki;#删除表的内容，但是仍然保留表的结构

数据去重

下面测试数据约9万行数据，4倍的重复数据。

#distinct：下面是当所有字段都一样时才进行去重；如果数据量大且不进行优化，将会非常非常慢。
mysql> create table temp_wiki select distinct * from wiki;
Query OK, 22424 rows affected (5 min 6.19 sec)
Records: 22424  Duplicates: 0  Warnings: 0


#group by对某个字段去重，还是非常慢，稍微快一点，还是远远达不到要求
# MySQL中使用 GROUP BY 关键字对某个或某些字段进行查询分组，每组只返回一条记录；在未启用 ONLY_FULL_GROUP_BY 时才允许这种写法，此时非分组列的取值由服务器任意选择（通常是该组遇到的第一条，但并无保证）。
mysql> create table temp_wiki2 select * from wiki group by id;
Query OK, 22424 rows affected (4 min 16.14 sec)
Records: 22424  Duplicates: 0  Warnings: 0

#建立索引后再操作
mysql> create index index_id on wiki(id);#index_id 是自定义的索引名
mysql> create table temp_wiki4 select * from wiki group by id;
Query OK, 22424 rows affected (6 min 41.90 sec)
Records: 22424  Duplicates: 0  Warnings: 0

#插入到一个定义好的空表中，似乎更慢一些。
mysql> insert into temp_wiki select * from wiki group by id;
Query OK, 22424 rows affected (6 min 2.21 sec)
Records: 22424  Duplicates: 0  Warnings: 0

下面测试是25697行数据，有3273行数据重复


mysql> create table temp select distinct * from wiki;
Query OK, 22424 rows affected (6.03 sec)
Records: 22424  Duplicates: 0  Warnings: 0

应该还有更好的方法在MySQL中去重，对于大数据，不要直接在MySQL中操作去重，还是应用Spark-SQL,Hive,HBase等工具吧。对小数据集，处理速度还是可以的。

聚合函数

所有聚合函数请参考：http://dev.mysql.com/doc/refman/5.7/en/group-by-functions.html

Name	Description
AVG()	返回指定列的平均值
COUNT()	返回指定列中非NULL值的个数
COUNT(DISTINCT)	返回指定列中不重复的非NULL值的个数
MAX()	返回指定列的最大值
MIN()	返回指定列的最小值
SUM()	返回指定列的所有值之和
STD()	返回总体标准差
STDDEV()	返回总体标准偏差（同上）
STDDEV_POP()	返回总体标准偏差（同上）
STDDEV_SAMP()	返回样本标准差
GROUP_CONCAT(col)	返回由属于一组的列值连接组合而成的结果
VAR_POP()	返回总体方差
VARIANCE()	返回总体方差（同上）
VAR_SAMP()	返回样本方差

mysql> select count(1),min(id),max(id),avg(id) from wiki;

COUNT(expr)实际上的意思是，评估括号中的表达式是否为NULL，如果为NULL则不计数，而非NULL则会计数。COUNT(1)中的常量1永远不为NULL，COUNT(*)则直接统计行数（不论各列是否为NULL），因此两者的结果相同。原笔记建议不使用COUNT(*)而是使用COUNT(1)；需要注意的是，在MySQL的InnoDB中两者的处理方式相同，没有性能差异。

mysql> select count(1) from wiki where id>5000000000000000000;

+----------+
| count(1) |
+----------+
|     9440 |
+----------+
1 row in set (0.01 sec)

mysql> select std(id),stddev(id),stddev_pop(id) from city;
+--------------------+--------------------+--------------------+
| std(id)            | stddev(id)         | stddev_pop(id)     |
+--------------------+--------------------+--------------------+
| 1177.5058386267121 | 1177.5058386267121 | 1177.5058386267121 |
+--------------------+--------------------+--------------------+
1 row in set (0.00 sec)

mysql> select var_pop(id),variance(id) from city;
+--------------------+--------------------+
| var_pop(id)        | variance(id)       |
+--------------------+--------------------+
| 1386519.9999999965 | 1386519.9999999965 |
+--------------------+--------------------+

 var_pop(id)是stddev_pop(id)的平方。
1 row in set (0.00 sec)