我这里处理的场景是推文,其他平台可能需要做相应调整。
处理表情:
import re
def filter_emoji(content):
    """Return the distinct emoji strings found in ``content``.

    Despite the name, nothing is removed from ``content``; the matches of
    two patterns are collected and de-duplicated:

    * a "run" pattern that matches one or more consecutive characters in
      the common emoji ranges, so adjacent emoji come back as ONE string;
    * a "single" pattern that matches one character at a time: any code
      point above the BMP, or one of a hand-picked list of BMP symbols.

    Because both patterns are applied, a run of two emoji yields the run
    itself plus each individual character.

    Args:
        content: Text to scan.

    Returns:
        A list of unique matched strings. The order is arbitrary because
        the list is built from a set.
    """
    # Hand-picked BMP symbols to treat as emoji. Written as one character
    # class so that a missing "|" or a stray "]" cannot silently turn an
    # alternative into a two-character sequence (the original pattern
    # required "■⬆", "✌]", "‼]" and "♥]" because of such typos).
    # "\ufe0f" is the invisible variation selector that follows many emoji.
    extra_symbols = u'❤☕✌♂♀☹\ufe0f♥✊★■⬆➡⚽⚠▶⏺⛈❄❌☀☺‼'

    try:
        emoji_runs = re.compile(
            u'[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u2B55]+'
        )
    except re.error:
        # Presumably for narrow (UCS-2) interpreter builds, where astral
        # ranges cannot appear in a character class: spell the same ranges
        # as surrogate pairs. The group is non-capturing so that findall
        # returns the whole run rather than only its last repetition.
        emoji_runs = re.compile(
            u'(?:\ud83c[\udf00-\udfff]'
            u'|\ud83d[\udc00-\ude4f\ude80-\udeff]'
            u'|[\u2600-\u2B55])+'
        )

    try:
        single_emoji = re.compile(
            u'[\U00010000-\U0010ffff' + extra_symbols + u']'
        )
    except re.error:
        # Same fallback idea: any surrogate pair, or one of the extras.
        single_emoji = re.compile(
            u'[\uD800-\uDBFF][\uDC00-\uDFFF]|[' + extra_symbols + u']'
        )

    found = set(emoji_runs.findall(content))
    found.update(single_emoji.findall(content))
    return list(found)
处理链接:
import re
# 本函数输入字符串,匹配网址,返回匹配的网址列表
def parse_urls(string):
    """Extract URLs from ``string``.

    Two kinds of links are recognised:

    * anything starting with ``http://`` or ``https://``;
    * scheme-less ``pic.twitter.com/...`` image links.

    Args:
        string: Text to scan.

    Returns:
        A list of matched URLs: all http(s) URLs in order of appearance,
        followed by the ``pic.twitter.com/...`` links that are not already
        part of one of those http(s) URLs.
    """
    # Characters allowed after the prefix. Kept exactly as before: note that
    # "$-_" is a *range* (U+0024..U+005F), which is what lets "/", ":", "?"
    # and "=" match inside a URL. Raw strings avoid the invalid "\(" / "\)"
    # string escapes of the non-raw originals.
    url_body = (
        r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    http_pattern = re.compile(r'http[s]?://' + url_body)
    # Dots are escaped so that only the literal host name matches.
    pic_pattern = re.compile(r'pic\.twitter\.com/' + url_body)

    http_matches = list(http_pattern.finditer(string))
    urls = [match.group() for match in http_matches]
    http_spans = [match.span() for match in http_matches]

    for match in pic_pattern.finditer(string):
        start, end = match.span()
        # Skip the bare suffix of a link already reported in full, e.g. the
        # "pic.twitter.com/x" inside "https://pic.twitter.com/x".
        already_reported = any(
            lo <= start and end <= hi for lo, hi in http_spans
        )
        if not already_reported:
            urls.append(match.group())
    return urls
处理at@:
import re
def parse_ats(string):
    """Collect the user names mentioned with ``@`` in ``string``.

    A mention is an ``@`` followed by one or more word characters,
    hyphens, or characters in the U+4E00..U+9FA5 range. The leading ``@``
    is not included in the result.

    Args:
        string: Text to scan.

    Returns:
        A list of mentioned names in order of appearance (duplicates kept).
    """
    mention_pattern = re.compile(r'@([\u4e00-\u9fa5\w\-]+)')
    return mention_pattern.findall(string)
处理topic#(根据平台调整):
import re
def parse_topics(string):
    """Extract the distinct ``#topic`` tags from ``string``.

    Candidates are gathered with three heuristics and then merged:

    1. ``#`` followed by everything up to the next whitespace or the end
       of the string;
    2. a tag at the very start of the string that is followed by a ``.``;
    3. ``#`` followed by word characters plus one more non-space character.

    Every ``.`` is removed from each candidate, duplicates are dropped, and
    a candidate that is nothing but ``#`` is discarded.

    Args:
        string: Text to scan.

    Returns:
        A list of unique tags, each including its leading ``#``. The order
        is arbitrary because the list is built from a set.
    """
    # "(?:\s|$)" treats end-of-string like whitespace; without it a tag
    # that ends the text (e.g. "vote #a") is missed by this pattern.
    candidates = re.findall(r"([#].*?)(?:\s|$)", string)
    candidates += re.findall(r"(^[#]\w+\S)\.", string)
    candidates += re.findall(r"#\w+\S", string)

    topics = {candidate.replace('.', '') for candidate in candidates}
    # A lone "#" (as in "# 1" or "#. ") is not a topic.
    topics.discard('#')
    return list(topics)