Python避坑指南
1. Crontab 配置 Python 定时任务
使用 Python 3 写脚本,但是在 Crontab 中配置定时任务时,脚本报错:UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)
按理说,使用 Python 3 不应该再有这种错误,Python3 已经全部自动使用了 Unicode编码,这种错误是由于编码不一致导致的。具体原因,我怀疑是 Crontab 运行定时任务时,调用的系统语言环境不是 UTF-8 编码导致的。
解决办法:
- 设置系统编码;
export LC_ALL="en_US.utf8"
- Python脚本中重定向编码;
import sys
import codecs
# Rebind sys.stdout to a UTF-8 codecs writer built on the stream returned by
# sys.stdout.detach(), so printed text is encoded as UTF-8.
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
Reference: https://www.cnblogs.com/lsdb/p/12470739.html
2. 字符编码
推荐使用 Python 3;如果因为项目原因必须使用 Python 2,编码遵循如下 3 点:
2.1 文件编码
import sys
# Python 2 snippet (see the surrounding note): reload the sys module, then
# set the default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding("UTF-8")
2.2 中文字符串编码
所有字符串都添加 u'' 前缀
2.3 字符串的 isinstance
- Python 2 中使用:isinstance(u'一个字符串', basestring)
- Python 3 中使用:isinstance(u'一个字符串', str)
2.4 查找字符串
查找使用的是 >= 0,而不是 > 0(find 找不到时返回 -1,子串位于开头时返回 0)。
u'一个字符串'.find(u'一个') >= 0
2.5 判断是否是中文
def _is_chinese_char(self, cp):
    """Return True if code point ``cp`` falls in one of the CJK ranges below.

    Args:
        cp: Integer Unicode code point (e.g. the result of ``ord(char)``).

    Returns:
        True when ``cp`` lies inside any listed inclusive range, else False.
    """
    # A "chinese character" here means anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Despite its name, that block does NOT hold all Japanese and Korean
    # characters: modern Korean Hangul and Japanese Hiragana/Katakana live in
    # different blocks. Those alphabets write space-separated words, so they
    # get no special treatment and are handled like every other language.
    cjk_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )
    # Both ends of every range are inclusive.
    return any(low <= cp <= high for low, high in cjk_ranges)
# for char in text:
# cp = ord(char)
# if self._is_chinese_char(cp)
2.6 字符串全角/半角转换
def Q2B(uchar):
    """Convert one full-width character to its half-width counterpart.

    Args:
        uchar: A single-character string.

    Returns:
        The half-width character when the shifted code lands in
        0x20..0x7e; otherwise ``uchar`` unchanged.
    """
    code = ord(uchar)
    # The ideographic space (0x3000) maps straight to an ASCII space; every
    # other candidate is shifted down by the fixed offset 0xfee0.
    half = 0x0020 if code == 0x3000 else code - 0xfee0
    if 0x0020 <= half <= 0x7e:
        return chr(half)
    # The shifted value is not a half-width character: keep the input as-is.
    return uchar
def B2Q(uchar):
    """Convert one half-width character to its full-width counterpart.

    Args:
        uchar: A single-character string.

    Returns:
        The full-width character when ``uchar`` is in 0x20..0x7e;
        otherwise ``uchar`` unchanged.
    """
    code = ord(uchar)
    if not 0x0020 <= code <= 0x7e:
        # Not a half-width character: hand the input back untouched.
        return uchar
    # Apart from the space (which maps to 0x3000), the relation is
    # half-width = full-width - 0xfee0.
    return chr(0x3000 if code == 0x0020 else code + 0xfee0)
def Q2B_string(strs):
    """Convert a string from full-width to half-width, character by character.

    Args:
        strs: Iterable of single characters (typically a ``str``).

    Returns:
        A new string made of ``Q2B`` applied to each character in order.
    """
    return ''.join(Q2B(ch) for ch in strs)
2.7 判断是否是标点符号
def _is_punctuation(char):
    """Return True if ``char`` should be treated as punctuation.

    Args:
        char: A single-character string.

    Returns:
        True for any non-letter/non-digit printable ASCII character, or for
        any character whose Unicode category starts with "P"; else False.
    """
    cp = ord(char)
    # All non-letter/number ASCII counts as punctuation. Characters such as
    # "^", "$", and "`" are not in the Unicode Punctuation class, but they are
    # treated as punctuation anyway for consistency.
    if 33 <= cp <= 47 or 58 <= cp <= 64 or 91 <= cp <= 96 or 123 <= cp <= 126:
        return True
    return unicodedata.category(char).startswith("P")
2.8 去除非间距字符
def _run_strip_accents(self, text):
    """Strip accents from ``text`` by dropping nonspacing marks.

    Args:
        text: The string to clean.

    Returns:
        The NFD-normalized text with every character of Unicode category
        "Mn" removed.
    """
    # Normalizing matters for any program that has to treat Unicode text
    # consistently, especially for user input whose encoding is hard to
    # control. NFD decomposes characters into multiple combining characters.
    decomposed = unicodedata.normalize("NFD", text)
    # category() gives the character's Unicode category; "Mn"
    # (Mark, Nonspacing) marks a character that modifies a base character.
    # https://www.fileformat.info/info/unicode/category/Mn/list.htm
    return "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
https://www.programcreek.com/python/example/1020/unicodedata.category
2.9 全角转半角
def strQ2B(ustring):
    """Convert every full-width character in ``ustring`` to half-width.

    Args:
        ustring: Iterable of single characters (typically a ``str``).

    Returns:
        A new string in which the full-width space (0x3000) becomes an ASCII
        space and code points 0xFF01..0xFF5E are shifted down by 0xFEE0; all
        other characters are copied unchanged.
    """
    converted = []
    for uchar in ustring:
        code = ord(uchar)
        if code == 0x3000:
            # The full-width space does not follow the offset rule.
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:
            # Every other full-width character sits a fixed offset above its
            # half-width counterpart.
            code -= 0xFEE0
        converted.append(chr(code))
    # Join once at the end instead of growing a string with += in the loop.
    return "".join(converted)
2.10 判断是否全部是中文
def is_all_chinese(str_in):
    """Return True if every character of ``str_in`` is in U+4E00..U+9FA5.

    Args:
        str_in: Iterable of characters (typically a ``str``).

    Returns:
        False as soon as one character falls outside the range, True
        otherwise (including for empty input).
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in str_in)
2.11 判断是否全部是英文
def is_all_eng(str_in):
    """Return True if every character of ``str_in`` is an ASCII letter.

    Args:
        str_in: Iterable of characters (typically a ``str``).

    Returns:
        False as soon as one character is not in a-z / A-Z, True otherwise
        (including for empty input).
    """
    # string.ascii_letters is ascii_lowercase + ascii_uppercase; look it up
    # once instead of rebuilding the concatenation for every character.
    letters = string.ascii_letters
    return all(ch in letters for ch in str_in)
2.12 删除括号
def trim_parent_bracket(str_in):
    """Remove parenthesized groups whose content is all English letters.

    Args:
        str_in: The string to clean.

    Returns:
        ``str_in`` with every ``(content)`` occurrence removed when
        ``is_all_eng(content)`` is true; other groups are left alone.
    """
    # Candidates are collected once from the input, using a lazy match that
    # stops at the first closing parenthesis.
    for inner in re.findall(r'[(](.*?)[)]', str_in):
        if not is_all_eng(inner):
            continue
        target = '(' + inner + ')'
        # str.replace removes every occurrence of the group, not just one.
        str_in = str_in.replace(target, '')
    return str_in