import re

# Inclusive code-point ranges of the characters that survive cleaning.
# Comparing code points is equivalent to comparing UTF-8 encodings (UTF-8
# preserves code-point order) without encoding every character.
#
# UTF-8 byte sequences of the non-ASCII ranges, for reference:
#   0xE2 0xBA 0x80      - 0xE2 0xBB 0xB3       (U+2E80  - U+2EF3)
#   0xE2 0xBC 0x80      - 0xE2 0xBF 0x95       (U+2F00  - U+2FD5)
#   0xE3 0x80 0x85      - 0xE3 0x80 0xA9       (U+3005  - U+3029)
#   0xE3 0x80 0xB8      - 0xE4 0xB6 0xB5       (U+3038  - U+4DB5)
#   0xE4 0xB8 0x80      - 0xEF 0xA9 0xAA       (U+4E00  - U+FA6A)
#   0xEF 0xA9 0xB0      - 0xEF 0xAB 0x99       (U+FA70  - U+FAD9)
#   0xF0 0xA0 0x80 0x80 - 0xF0 0xAF 0xA8 0x9D  (U+20000 - U+2FA1D)
_VALID_RANGES = (
    (0x30, 0x39),        # ASCII digits 0-9
    (0x61, 0x7A),        # ASCII lowercase a-z
    (0x2E80, 0x2EF3),
    (0x2F00, 0x2FD5),
    (0x3005, 0x3029),
    (0x3038, 0x4DB5),
    # U+4E00 - U+FA6A is split around the surrogate block U+D800 - U+DFFF:
    # lone surrogates cannot be encoded as UTF-8, so they are dropped rather
    # than kept (or raising UnicodeEncodeError during cleaning).
    (0x4E00, 0xD7FF),
    (0xE000, 0xFA6A),
    (0xFA70, 0xFAD9),
    (0x20000, 0x2FA1D),
)

# Symbols removed explicitly before the range filter runs.  Most of them lie
# outside the kept ranges anyway, but the CJK brackets/quotes (U+300A - U+301F)
# and U+303E/U+303F fall inside a kept range and are only removed here.
# A raw string keeps regex escapes such as \n and \] from being treated as
# (invalid) Python string escapes; "+,\-./" spells out the set the original
# "+-/" range matched.
_SPECIAL_SYM = re.compile(
    r"""[\n\t!?。"#$%&'()*+,\-./:;<=>@\[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿☺😘😔😔😜😜😪😭😰🍶🍧🌄🌅🌊🗻🍚🍣🍦–—‘’‛“”„‟…‧﹏]+"""
)


def _valid_chr(char):
    """Return True if the single character *char* is in a kept range."""
    code_point = ord(char)
    return any(low <= code_point <= high for low, high in _VALID_RANGES)


def clean(line):
    """Normalize *line* to lowercase ASCII alphanumerics and CJK characters.

    The text is lowercased, every symbol listed in ``_SPECIAL_SYM`` is
    removed, and then any remaining character outside ``_VALID_RANGES`` is
    dropped (this includes whitespace and characters that ``str.lower()``
    leaves outside ``a-z``).

    Both ends of each range in ``_VALID_RANGES`` are kept, so boundary
    characters such as U+3005 and U+20000 survive.

    Args:
        line: The text to clean, as a ``str``.

    Returns:
        The cleaned string; empty if nothing in *line* is kept.
    """
    lowered = line.lower()
    without_symbols = _SPECIAL_SYM.sub("", lowered)
    return "".join(ch for ch in without_symbols if _valid_chr(ch))