正则表达式
正则表达式模式建议用原始字符串书写(如 r'\t',等价于 '\\t'):反斜杠不会先被 Python 转义,而是原样交给正则引擎,由正则引擎匹配相应的特殊字符(此处为制表符)。
re.S(即 re.DOTALL)使 . 匹配包括换行符在内的所有字符;默认情况下 . 不匹配换行符。
(.*?) 是一个捕获分组;.* 后面多了一个问号,代表非贪婪模式,也就是说只匹配符合条件的最少字符。re.findall 以列表形式返回所有匹配到的分组内容。
# Non-greedy capture of the text between "</strong>" and "<br/><br/>".
# DOTALL lets "." cross line breaks; findall returns the captured groups as a list.
ss = re.findall(r"</strong>(.*?)<br/><br/>", strpp, flags=re.DOTALL)
实例1:
# re.search returns a Match object for the first match (or None when nothing
# matches); call .start() on it to get the index of the first matched character.
itemleft = re.search(
    r'(Item\s+offered)', desc, flags=re.IGNORECASE | re.DOTALL
)
fitleft = re.search(
    r'(Fits?\s+models?\s+&\s+years?)', desc, flags=re.IGNORECASE
)
实例2:
# Normalise the description: collapse runs of whitespace into one space,
# turn "|" into a space, and drop the spaces around "-".
desc = re.sub(r"\s+", " ", desc)  # \s matches one whitespace character
desc = re.sub(r"\|", " ", desc)
# The 4th positional parameter of re.sub is `count`, not `flags`. Passing re.I
# there capped the call at 2 replacements and never enabled case-insensitive
# matching. Case is irrelevant for this pattern, so no flag is needed.
desc = re.sub(r"\s*-\s*", "-", desc)
# Canonicalise synonyms, ignoring case. `flags` must be passed by keyword so
# that it is not mistaken for `count`.
desc = re.sub(r"(folder)|(folding)", "folding", desc, flags=re.I)
desc = re.sub(r"(Manually)|(Manual)", "Manual", desc, flags=re.I)
desc = re.sub(r"(telescopic)|(Telescoping)", "Telescoping", desc, flags=re.I)
# \w matches any letter, digit or underscore.
se = re.search(
    r"(Manual folding)|(Power Folding: No)|(Manual [\s\w]*? folding)",
    desc,
    flags=re.I,
)
替换
# Regex-replace every "|" in the "desc" and "fits" columns. The pipe is escaped
# because it is a regex metacharacter; the replacement is the two-character
# template \n (presumably expanded to a newline by the regex substitution —
# confirm against pandas behaviour).
df = df.replace({"desc": {r"\|": "\\n"}}, regex=True)
df = df.replace({"fits": {r"\|": "\\n"}}, regex=True)
# The database descriptions contain the ampersand in two forms; force the
# HTML-escaped form back to a plain "&".
# NOTE(review): the original pattern and replacement were both "&", which made
# this line and the str.replace below no-ops. The HTML entity "&amp;" was
# presumably lost when the snippet was copied — confirm against real data.
df = df.replace({"description": {r"\B&amp;\B": "&"}}, regex=True)
# Models such as "town & country" contain an ampersand, hence the special case.
model = str(model).replace("&amp;", "&")
res = r"-\d{4} " + str(model)
实例3:
# \b is a word boundary, so each alternative must end at the end of a word.
a = re.compile(r'(door\b)|(cab\b)|(sedan\b)|(doors\b)').search(fitment)
# Turn every "|" into a line break to make the later regex matching easier.
s_fitment = "\n".join(s_fitment.split("|"))
实例4:
# Escape the description so it can be embedded in a SQL string literal.
# NOTE(review): hand-rolled escaping is fragile; prefer parameterised queries
# (placeholders) at the point where the statement is executed.
desc = str(df.loc[i, "new_description_new"])
# Backslashes go first so the escapes added below are not doubled again.
desc = desc.replace("\\", "\\\\")
desc = desc.replace("\'", "\\\'")
desc = desc.replace("\"", "\\\"")
# A blank line ("\n\n") collapses into a single escaped newline.
desc = desc.replace("\n\n", "\\n")
desc = desc.replace("\r\n", "\\n")
desc = desc.replace("\n", "\\n")
desc = desc.replace("\t", "\\t")