小说单词统计规则(泰语除外)
iOS 小说单词统计规则算法
import Foundation
/// Counts "words" in novel text (all languages except Thai).
///
/// A character whose first Unicode scalar lies in a configured full-width range counts
/// as one word by itself. Any other run of characters counts as one word; a run ends at
/// a configured break character, at a full-width character, or at the end of the text.
final class WordCounter {
    static let shared = WordCounter()

    /// Inclusive scalar-value ranges read from the `break_spaces` resource.
    private var mBreakSpaceRanges = [(Int, Int)]()
    /// Inclusive scalar-value ranges read from the `fullwidth_chars` resource.
    private var mFullWidthCharRanges = [(Int, Int)]()

    private init() {
        // Load the break-character and full-width-character tables from the bundle.
        mBreakSpaceRanges = readConfigFile(filePath: "break_spaces")
        mFullWidthCharRanges = readConfigFile(filePath: "fullwidth_chars")
    }

    /// Returns the number of words in `content`.
    ///
    /// - Parameter content: Text to count; `nil` is treated as empty.
    /// - Returns: The word count, `0` for `nil` or empty input.
    func getWordCount(content: String?) -> Int {
        guard let content = content else {
            return 0
        }
        var wordCount = 0
        // True while we are inside a run of ordinary (non-break, non-full-width) characters.
        var insideWord = false
        for c in content {
            if isBreakSpace(c: c) {
                // A break character only terminates the pending run.
                if insideWord {
                    wordCount += 1
                    insideWord = false
                }
            } else if isFullWidthChar(c: c) {
                // A full-width character is a word by itself and also terminates the pending run.
                wordCount += 1
                if insideWord {
                    wordCount += 1
                    insideWord = false
                }
            } else {
                insideWord = true
            }
        }
        // The end of the text terminates the last run.
        if insideWord {
            wordCount += 1
        }
        return wordCount
    }

    /// Whether `c` is a word-break character.
    private func isBreakSpace(c: Character) -> Bool {
        return compareCode(c: c, codeArr: mBreakSpaceRanges)
    }

    /// Whether `c` is a full-width character or symbol.
    private func isFullWidthChar(c: Character) -> Bool {
        return compareCode(c: c, codeArr: mFullWidthCharRanges)
    }

    /// Whether the first Unicode scalar of `c` lies inside any inclusive range of `codeArr`.
    private func compareCode(c: Character, codeArr: [(Int, Int)]) -> Bool {
        guard let scalar = String(c).unicodeScalars.first else {
            return false
        }
        let value = Int(scalar.value)
        return codeArr.contains { $0.0 <= value && value <= $0.1 }
    }

    /// Reads the bundled text resource named `filePath` and returns its ranges.
    ///
    /// Returns an empty list when the resource is missing or cannot be read.
    private func readConfigFile(filePath: String) -> [(Int, Int)] {
        guard let path = Bundle.main.path(forResource: filePath, ofType: "txt") else {
            return []
        }
        do {
            let text = try String(contentsOfFile: path, encoding: .utf8)
            return parseRanges(in: text)
        } catch {
            print(error.localizedDescription)
            return []
        }
    }

    /// Parses one `(min, max)` range per line.
    ///
    /// Blank lines and lines starting with `#` are ignored. A line is either a single hex
    /// value (`0x20`) or two hex values joined by `-` (`0x84-0x89`). Malformed lines are
    /// skipped instead of inheriting the value of the previous line or crashing.
    private func parseRanges(in text: String) -> [(Int, Int)] {
        var ranges = [(Int, Int)]()
        for rawLine in text.components(separatedBy: .newlines) {
            let line = rawLine.trimmingCharacters(in: .whitespaces)
            if line.isEmpty || line.hasPrefix("#") {
                continue
            }
            let fields = line.split(separator: "-").map { String($0) }
            guard let first = fields.first, let lower = parseHex(first) else {
                print("WordCounter: skipping malformed range line: \(line)")
                continue
            }
            if line.contains("-") {
                guard fields.count > 1, let upper = parseHex(fields[1]) else {
                    print("WordCounter: skipping malformed range line: \(line)")
                    continue
                }
                ranges.append((lower, upper))
            } else {
                ranges.append((lower, lower))
            }
        }
        return ranges
    }

    /// Parses the leading hexadecimal number of `field`, with an optional `0x`/`0X` prefix.
    ///
    /// Returns `nil` when `field` does not start with at least one hex digit.
    private func parseHex(_ field: String) -> Int? {
        var digits = Substring(field.trimmingCharacters(in: .whitespaces))
        if digits.hasPrefix("0x") || digits.hasPrefix("0X") {
            digits = digits.dropFirst(2)
        }
        let hexDigits = digits.prefix(while: { "0123456789abcdefABCDEF".contains($0) })
        return Int(hexDigits, radix: 16)
    }
}
- break_spaces.txt
0x20
0x3000
0x0A
0x09
0x0B
0x0D
0xA0
- fullwidth_chars.txt
0x80
0x82
0x84-0x89
0x8B
0x91-0x99
0x9B
0xA1
0xA4
0xA7-0xA8
0xAF
0xB0-0xB1
0xB4-0xB8
0xBC-0xBF
0xF7
0xD7
# CJK_UNIFIED_IDEOGRAPHS
0x4E00-0x9FFF
# CJK_COMPATIBILITY_IDEOGRAPHS
0xF900-0xFAFF
# CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
0x3400-0x4DBF
# CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
0x20000-0x2A6DF
# GENERAL_PUNCTUATION
0x2000-0x2018
0x201A-0x206F
# SPACING_MODIFIER_LETTERS
0x02B0-0x02FF
# CJK_SYMBOLS_AND_PUNCTUATION
0x3000-0x303F
# CJK_COMPATIBILITY
0x3300-0x33FF
# CJK_COMPATIBILITY_FORMS
0xFE30-0xFE4F
# HANGUL_JAMO
0x1100-0x11FF
# CURRENCY_SYMBOLS
0x20A0-0x20CF
# DINGBATS
0x2700-0x27BF
# GEOMETRIC_SHAPES
0x25A0-0x25FF
# HANGUL_SYLLABLES
0xAC00-0xD7AF
# HANGUL_COMPATIBILITY_JAMO
0x3130-0x318F
# HANGUL_JAMO_EXTENDED_A
0xA960-0xA97F
# HANGUL_JAMO_EXTENDED_B
0xD7B0-0xD7FF
# VERTICAL_FORMS
0xFE10-0xFE1F
# KATAKANA_PHONETIC_EXTENSIONS
0x31F0-0x31FF
# KATAKANA
0x30A0-0x30FF
# HIRAGANA
0x3040-0x309F
# HALFWIDTH_AND_FULLWIDTH_FORMS
0xFF00-0xFFEF
# Combining Diacritical Marks
0x0300-0x036F
iOS 泰语字数统计规则算法 (需要泰语库)
import Foundation
/// Counts Thai words by splitting text into tokens with a bundled word list.
final class ThaiWordCount {
    /// Dictionary words, used for exact-match lookups.
    private var thaiWords = [String: Bool]()
    /// Nested dictionaries keyed by single characters; a prefix tree over the dictionary words.
    private var wordTree = NSMutableDictionary()
    /// Maps a compound word to the parts it is split into.
    private var compoundWords = [String: [String]]()

    static let shared = ThaiWordCount()

    private init() {
        // A missing or malformed word list is logged and leaves the dictionary empty
        // instead of crashing on a force unwrap / force cast.
        guard let url = Bundle.main.url(forResource: "thaiDicts.json", withExtension: nil) else {
            print("ThaiWordCount: thaiDicts.json not found in the main bundle")
            return
        }
        do {
            let data = try Data(contentsOf: url)
            guard let words = try JSONSerialization.jsonObject(with: data, options: []) as? [String] else {
                print("ThaiWordCount: thaiDicts.json is not a JSON array of strings")
                return
            }
            readDictionry(words)
        } catch {
            print(error.localizedDescription)
        }
    }

    /// Returns the number of tokens `content` is split into.
    ///
    /// - Parameter content: Text to count; `nil` yields `0`.
    func getTokenSize(content: String?) -> Int {
        guard let content = content else {
            return 0
        }
        return tokenize(content: content).count
    }
}
/// Dictionary loading and prefix-tree helpers.
extension ThaiWordCount {
    /// Registers every entry of `words` in `thaiWords` and in the prefix tree.
    ///
    /// An entry containing "," is handled as a compound entry: the text before the first
    /// ":" is the word, and the comma-separated list after it is stored in `compoundWords`.
    /// A comma-containing entry without a ":" section is registered as a plain word
    /// rather than crashing on a missing element.
    private func readDictionry(_ words: [String]) {
        for entry in words where !entry.isEmpty {
            var word = entry
            if entry.contains(",") {
                let sections = entry.split(separator: ":").map { String($0) }
                if sections.count > 1 {
                    word = sections[0]
                    compoundWords[word] = sections[1].split(separator: ",").map { String($0) }
                }
            }
            thaiWords[word] = true
            generateWordTree(word: word)
        }
    }

    /// Inserts `word` into the prefix tree, one nested dictionary per character.
    private func generateWordTree(word: String) {
        var node = wordTree
        for c in word {
            let key = String(c)
            if let child = node[key] as? NSMutableDictionary {
                node = child
            } else {
                let child = NSMutableDictionary()
                node[key] = child
                node = child
            }
        }
    }

    /// Whether `word` is a prefix of (or equal to) some word inserted into the tree.
    ///
    /// The empty string is always found.
    private func queryWordTree(word: String) -> Bool {
        var node = wordTree
        for c in word {
            guard let child = node[String(c)] as? NSMutableDictionary else {
                return false
            }
            node = child
        }
        return true
    }
}
extension ThaiWordCount {
    /// Splits `content` into tokens.
    ///
    /// The text is lower-cased, symbols are replaced by spaces, and the result is split on
    /// spaces. A piece containing at least one character matched by `[ก-๙]` is further
    /// split with `breakThaiWords`; any other piece is kept as a single token.
    private func tokenize(content: String) -> [String] {
        let lowerCaseContent = convertLowerCase(content: content)
        let filteredContent = filterSymbols(content: lowerCaseContent)
        let pieces = filteredContent.split(separator: " ").map { String($0) }
        let pattern = "[ก-๙]"
        // The pattern is a compile-time constant, so construction cannot fail at runtime.
        let regex = try! NSRegularExpression(pattern: pattern, options: [])
        var result = [String]()
        for str in pieces {
            // NSRange is measured in UTF-16 units, not in Characters.
            let fullRange = NSRange(location: 0, length: str.utf16.count)
            if regex.firstMatch(in: str, options: [], range: fullRange) != nil {
                for thaiToken in breakThaiWords(word: str) where !thaiToken.isEmpty {
                    result.append(thaiToken)
                }
            } else if !str.isEmpty {
                result.append(str)
            }
        }
        return result
    }

    /// Splits a piece of Thai text into words using the dictionary and the prefix tree.
    ///
    /// Characters are consumed one at a time. While the accumulated candidate is still a
    /// prefix in the tree it keeps growing; each time it equals a dictionary word, that
    /// word (or the parts of a compound word) is written at slot `index` of the result,
    /// overwriting a shorter match written there earlier. Characters that cannot be
    /// matched are collected in `badWord` and emitted as a token of their own.
    private func breakThaiWords(word: String) -> [String] {
        var words = [String]()
        var index = 0
        var currentWord = ""
        var spareWord = ""
        var badWord = ""
        var nextWordAble = false

        // Writes `token` at slot `index`, appending when that slot does not exist yet.
        func store(_ token: String) {
            if index < words.count {
                words[index] = token
            } else {
                words.append(token)
            }
        }

        for c in word {
            let checkWord = currentWord + String(c)
            if queryWordTree(word: checkWord) {
                currentWord = checkWord
                if thaiWords[currentWord] != nil {
                    if !badWord.isEmpty {
                        // The last character of badWord started the word just matched,
                        // so it is dropped from the unmatched text.
                        store(String(badWord.dropLast()))
                        badWord = ""
                        index += 1
                    }
                    if let brokenWords = compoundWords[checkWord], !brokenWords.isEmpty {
                        for brokenWord in brokenWords {
                            store(brokenWord)
                            index += 1
                        }
                        // Leave index on the last part written.
                        index -= 1
                    } else {
                        store(checkWord)
                    }
                    spareWord = ""
                } else {
                    spareWord += String(c)
                }
                nextWordAble = true
            } else {
                if nextWordAble {
                    nextWordAble = false
                    currentWord = spareWord + String(c)
                    spareWord = String(c)
                    index += 1
                } else {
                    if badWord.isEmpty {
                        badWord = currentWord + String(c)
                    } else {
                        badWord += String(c)
                    }
                    currentWord = String(c)
                }
            }
        }
        if !badWord.isEmpty {
            words.append(badWord)
        }
        return words
    }
}
extension ThaiWordCount {
    /// Replaces every character that is not matched by `a-z`, `0-9` or `ก-๙`
    /// (case-insensitively) with a single space.
    private func filterSymbols(content: String) -> String {
        let pattern = "[^a-z0-9ก-๙]"
        // The pattern is a compile-time constant, so construction cannot fail at runtime.
        let regex = try! NSRegularExpression(pattern: pattern, options: .caseInsensitive)
        // NSRange is measured in UTF-16 units. Using the Character count here would leave
        // the tail of text containing combining marks unfiltered.
        let fullRange = NSRange(location: 0, length: content.utf16.count)
        return regex.stringByReplacingMatches(in: content, options: [], range: fullRange, withTemplate: " ")
    }

    /// Returns `content` converted to lower case.
    private func convertLowerCase(content: String) -> String {
        return content.lowercased()
    }
}
跨平台实现 Swift 调用 Javascript 算法
使用 JS 实现字数统计,三端统一算法。以下是 iOS(Swift)调用 JS 的方式:
import Foundation
import JavaScriptCore
/// Word counter that delegates to the bundled `stary-wordcount.js` script.
final class WordCounterFromJS: NSObject {
    static let shared = WordCounterFromJS()

    private let vm = JSVirtualMachine()
    private let context: JSContext

    private override init() {
        context = JSContext(virtualMachine: vm)
        super.init()
        context.exceptionHandler = { _, exception in
            print("JS Error:\(exception.debugDescription)")
        }
        // A missing or unreadable script is reported explicitly; the counter then returns 0
        // for every input instead of crashing on a force unwrap or hiding the read error.
        guard let url = Bundle.main.url(forResource: "stary-wordcount", withExtension: "js") else {
            print("WordCounterFromJS: stary-wordcount.js not found in the main bundle")
            return
        }
        do {
            let jsCode = try String(contentsOf: url)
            context.evaluateScript(jsCode)
        } catch {
            print("WordCounterFromJS: failed to read stary-wordcount.js: \(error.localizedDescription)")
        }
    }

    /// Returns the word count computed by the script's `WordCount.getWordCount`.
    ///
    /// - Parameter content: Text to count; `nil` yields `0`.
    /// - Returns: The script's result converted to `Int`, or `0` when no result is available.
    func getWordCount(content: String?) -> Int {
        guard let content = content else {
            return 0
        }
        let wordCounter = context.objectForKeyedSubscript("WordCount")
        let result = wordCounter?.invokeMethod("getWordCount", withArguments: [content, true])
        return Int(result?.toInt32() ?? 0)
    }
}