1 爬取拼音和笔顺
拼音爬自 https://zidian.900cha.com/ 。数据文件包含汉字拼音(带音标)和笔顺,共 20842 字(“壭亪寽兯嚸”这五个字未收录)。
笔顺爬自 http://bs.kaishicha.com/ 。数据文件包含汉字笔顺,共 20842 字(“壭亪寽兯嚸”这五个字未收录)。
/// <summary>
/// A single Chinese character record (character, radical, stroke data and pinyin readings)
/// together with its binary serialization routines.
/// </summary>
/// <remarks>
/// Binary layout, in order: <see cref="Char"/>, <see cref="Radical"/>, <see cref="StrokeCount"/>,
/// <see cref="Strokes"/>, <see cref="PinyinCount"/>, followed by the first
/// <see cref="PinyinCount"/> entries of <see cref="PinyinList"/>.
/// </remarks>
public class CharUnit
{
    /// <summary>
    /// The Chinese character.
    /// </summary>
    public char Char;

    /// <summary>
    /// The radical of the character.
    /// </summary>
    public char Radical;

    /// <summary>
    /// Total number of strokes.
    /// </summary>
    public byte StrokeCount;

    /// <summary>
    /// Stroke order.
    /// </summary>
    public string Strokes;

    /// <summary>
    /// Number of pinyin readings.
    /// </summary>
    public byte PinyinCount;

    /// <summary>
    /// The pinyin readings.
    /// </summary>
    public string[] PinyinList;

    /// <summary>
    /// Reads one <see cref="CharUnit"/> from <paramref name="binaryReader"/>, consuming the
    /// fields in the same order in which <see cref="Serialize"/> writes them.
    /// </summary>
    /// <param name="binaryReader">Reader positioned at the start of a serialized record.</param>
    /// <returns>A new instance populated from the reader.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="binaryReader"/> is null.</exception>
    public static CharUnit Deserialize(BinaryReader binaryReader)
    {
        if (binaryReader == null)
        {
            throw new System.ArgumentNullException(nameof(binaryReader));
        }

        var charUnit = new CharUnit();
        charUnit.Char = binaryReader.ReadChar();
        charUnit.Radical = binaryReader.ReadChar();
        charUnit.StrokeCount = binaryReader.ReadByte();
        charUnit.Strokes = binaryReader.ReadString();
        charUnit.PinyinCount = binaryReader.ReadByte();

        // The count byte read above determines how many strings follow in the stream.
        var pinyinList = new string[charUnit.PinyinCount];
        for (int i = 0; i < pinyinList.Length; i++)
        {
            pinyinList[i] = binaryReader.ReadString();
        }

        charUnit.PinyinList = pinyinList;
        return charUnit;
    }

    /// <summary>
    /// Writes this record to <paramref name="binaryWriter"/>.
    /// </summary>
    /// <remarks>
    /// The instance is validated before the first write, so an invalid record fails
    /// without leaving a truncated record in the output.
    /// </remarks>
    /// <param name="binaryWriter">Writer that receives the serialized record.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="binaryWriter"/> is null.</exception>
    /// <exception cref="System.InvalidOperationException">
    /// <see cref="Strokes"/> is null, <see cref="PinyinList"/> holds fewer than
    /// <see cref="PinyinCount"/> entries, or one of the entries to be written is null.
    /// </exception>
    public void Serialize(BinaryWriter binaryWriter)
    {
        if (binaryWriter == null)
        {
            throw new System.ArgumentNullException(nameof(binaryWriter));
        }

        // Validate everything up front: BinaryWriter rejects null strings, and failing
        // half-way through would leave a partial record in the stream.
        if (this.Strokes == null)
        {
            throw new System.InvalidOperationException("Strokes must not be null.");
        }

        int pinyinCount = this.PinyinCount;
        if (pinyinCount > 0 && (this.PinyinList == null || this.PinyinList.Length < pinyinCount))
        {
            throw new System.InvalidOperationException("PinyinList holds fewer entries than PinyinCount.");
        }

        for (int i = 0; i < pinyinCount; i++)
        {
            if (this.PinyinList[i] == null)
            {
                throw new System.InvalidOperationException("PinyinList must not contain null entries.");
            }
        }

        binaryWriter.Write(this.Char);
        binaryWriter.Write(this.Radical);
        binaryWriter.Write(this.StrokeCount);
        binaryWriter.Write(this.Strokes);
        binaryWriter.Write(this.PinyinCount);

        // Only the first PinyinCount entries are written, matching what Deserialize reads back.
        for (int i = 0; i < pinyinCount; i++)
        {
            binaryWriter.Write(this.PinyinList[i]);
        }
    }
}
2 vs2019新建.net core console项目,NuGet导入
Microsoft.EntityFrameworkCore //ef core
Microsoft.EntityFrameworkCore.Design //EF Core 设计时组件(迁移工具依赖)
Microsoft.EntityFrameworkCore.Tools //控制台中管理数据迁移
Microsoft.EntityFrameworkCore.Sqlite //sqlite
Microsoft.EntityFrameworkCore.Sqlite.Core //sqlite
HtmlAgilityPack //xpath
3 共五个表:汉字、部首、笔顺、拼音、拼音汉字many-to-many辅助表。部首和汉字是one-to-many,笔顺和汉字是one-to-one。
public class ChineseChar
{
public ChineseChar()
=> PinYins = new JoinCollectionFacade<PinYin, PinYinChar>(
PinYinChars,
pyc => pyc.PinYin,
py => new PinYinChar { PinYin = py, ChineseChar = this });
public int ChineseCharId { get; set; }
[Column(TypeName = "NCHAR(1)"), Required]
public