python正则表达式截取字符
#!/usr/bin/env python
# coding=utf-8
import re
"""
MonthlyTask::statProgramByService
"""
file_name = "./pro"
result = []

# Placeholder written into a record when a field is absent from the line.
_MISSING = '无'

# Title pattern: a double quote, the name (up to the first whitespace), any
# text, then the year inside parentheses.  Group 1 = name, group 3 = year.
_TITLE_RE = re.compile(r'\"(.*?)\s(.*?)\((.*?)\)(.*)')


def _field_pattern(label):
    """Compile the pattern for a ``<label>:<value>`` field.

    The value (group 2) is the shortest text after ``<label>:`` that is
    followed by three consecutive whitespace characters.
    """
    return re.compile(r'(.*?)%s:(.*?)\s\s\s(.*)' % label)


# Compiled once at import time instead of once per input line.
_DIRECTOR_RE = _field_pattern('导演')
_SCREENWRITER_RE = _field_pattern('编剧')
_ROLE_RE = _field_pattern('主演')
_TYPE_RE = _field_pattern('类型')
_AREA_RE = _field_pattern('地区')
_LANGUAGE_RE = _field_pattern('语言')
_EPISODES_RE = _field_pattern('集数')


def _field_value(pattern, line):
    """Return the captured field value from *line*, or None if absent."""
    match = pattern.search(line)
    if match is None:
        return None
    return match.group(2)


def _limit_entries(value, threshold, keep):
    """Shorten a '/'-separated list to its first *keep* entries.

    The value is returned untouched unless it has more than *threshold*
    entries.  Kept entries are stripped before being re-joined with
    ' / ', so the shortened value does not pick up doubled or trailing
    spaces from the original separators.
    """
    entries = value.split('/')
    if len(entries) <= threshold:
        return value
    return ' / '.join(entry.strip() for entry in entries[:keep])


def _limited_field(pattern, line, threshold, keep):
    """Extract a list-valued field and shorten it, or return the placeholder."""
    value = _field_value(pattern, line)
    if value is None:
        return _MISSING
    return _limit_entries(value, threshold, keep)


def parse_line(line):
    """Turn one raw description line into a '|'-separated record.

    The record layout is
    ``name|year|mold|type|director|screenwriter|role|area|language``;
    any field not found in *line* is written as '无'.
    """
    title = _TITLE_RE.search(line)
    if title:
        name = title.group(1)
        year = title.group(3)
    else:
        name = _MISSING
        year = _MISSING

    director = _limited_field(_DIRECTOR_RE, line, 3, 3)
    screenwriter = _limited_field(_SCREENWRITER_RE, line, 3, 3)
    role = _limited_field(_ROLE_RE, line, 5, 5)
    # NOTE(review): the original keeps only 4 genres once there are more
    # than 5 (a 5-genre value is left intact).  Preserved as-is; confirm
    # whether "more than 4" or "keep 5" was intended.
    type2 = _limited_field(_TYPE_RE, line, 5, 4)

    # The original read an undefined ``atemp`` here (NameError).  The
    # extraction/split step is restored by analogy with the other fields:
    # when several regions are listed, only the first one is kept.
    area = _field_value(_AREA_RE, line)
    if area is None:
        area = _MISSING
    else:
        regions = area.split('/')
        if len(regions) > 1:
            area = regions[0].strip()

    language = _field_value(_LANGUAGE_RE, line)
    if language is None:
        language = _MISSING

    # Genre keywords take priority; otherwise the presence of an episode
    # count decides between a TV series and a film.
    has_episodes = _EPISODES_RE.search(line) is not None
    if '真人秀' in type2:
        mold = '综艺'
    elif '动画' in type2:
        mold = '动画'
    elif has_episodes:
        mold = '电视剧'
    else:
        mold = '电影'

    return "%s|%s|%s|%s|%s|%s|%s|%s|%s" % (
        name, year, mold, type2, director, screenwriter, role, area, language)


def main():
    """Read ``file_name``, collect unique records in ``result`` and print them.

    Records keep first-seen order.  Blank lines are skipped.
    """
    # Set used only for O(1) duplicate checks; ``result`` keeps the order.
    seen = set(result)
    with open(file_name) as fd:
        for line in fd:
            line = line.strip("\n")
            if not line:
                continue
            res = parse_line(line)
            if res not in seen:
                seen.add(res)
                result.append(res)
    for item in result:
        # Single parenthesised argument: valid on both Python 2 and 3.
        print("%s" % item)


if __name__ == "__main__":
    main()