作业3:
import re
1、非负整数
[99,100,-100,-1,90]
numbers = [99,100,-100,-1,90,0,1,]
p1 = re.compile(r'^\d+$')
2、匹配正整数
p1 = re.compile(r'^[1-9]\d*$')
3、非正整数
p1 = re.compile(r'^(0|-\d+)$')
for i in numbers:
result = p1.search(str(i))
print(result)
4、qq邮箱:
QQ号长度为5—14位
content = 'aaaaa2506905772@qq.comaa:2019-12-20,abbbcc13345678911agc,2018-09-11'
p2 = re.compile(r'[1-9]\d{4,13}@qq\.com')
print(p2.search(content).group())
5、匹配11位电话号码
第1位是1,第2位是3-9
p3 = re.compile(r'1[3-9]\d{9}')
print(p3.search(content).group())
6、匹配日期:
2019-12-19
content = 'Aaaaaa2506905772@qq.comaa:2019-12-20,abbbcc13345678911agc,2018-09-11'
p4 = re.compile(r'[1-9]\d{3}-(1[0-2]|0?[1-9])-(3[0-1]|[1-2]\d|0?[1-9])')
m = p4.search(content)
print(m.group())
print(p4.search(content,m.end()+1))
7、长度为8-10的用户密码:
开头字母:必须大写,每一位可以是数字,字母,_
p5 = re.compile(r'[A-Z]\w{7,9}')
print(p5.search(content))
猫眼其他字段做出来。
import re,json
import requests
def write_to_json(infos):
    """Persist the scraped movie items to movie.json.

    :param infos: list of dicts, one per movie (``name``/``actor`` keys).
    """
    with open('movie.json', 'w', encoding='utf-8') as fp:
        # ensure_ascii=False keeps Chinese titles readable instead of \uXXXX escapes
        json.dump(infos, fp, ensure_ascii=False)
#解析页面内容
# Parse the page content
def parse_page(html_str, out=None):
    """Parse one Maoyan TOP-100 board page and collect movie items.

    Extracted fields per movie: name, actor (主演 line).

    :param html_str: raw HTML of a board page.
    :param out: optional list that receives the parsed items; when omitted
        the module-level ``infos`` list is used (original behaviour).

    Fixes vs. the original: failed ``search()`` calls no longer raise
    AttributeError (the entry is skipped / field left empty), and the
    per-item regexes are compiled once instead of per ``<dd>``.
    """
    target = out if out is not None else infos
    # Regex scraping principle: narrow the match range step by step —
    # whole board first, then each <dd> entry, then fields inside it.
    dl_p = re.compile(r'<dl class="board-wrapper">(.*?)</dl>', re.S)
    dl_m = dl_p.search(html_str)
    if dl_m is None:
        # Layout changed or the request was blocked — nothing to parse.
        return
    dd_p = re.compile(r'<dd>(.*?)</dd>', re.S)
    name_p = re.compile(r'" title="(.*?)" class="', re.S)
    actor_p = re.compile(r'<p class="star">(.*?)</p>', re.S)
    for dd in dd_p.findall(dl_m.group()):
        name_m = name_p.search(dd)
        actor_m = actor_p.search(dd)
        item = {
            'name': name_m.group(1) if name_m else '',
            'actor': actor_m.group(1).strip() if actor_m else '',
        }
        print(item)
        target.append(item)
#保存到json文件
#
# 一个方法干一件事
# One function does one job.
def main():
    """Fetch the 10 board pages (offset 0,10,...,90) and parse each one."""
    base_url = 'https://maoyan.com/board/4?offset=%s'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    for page in range(10):
        offset = page * 10
        resp = requests.get(base_url % offset, headers=headers)
        parse_page(resp.text)
if __name__ == '__main__':
    # Scrape pipeline (currently disabled):
    # infos = []
    # main()                 # extract data, appending every item to infos
    # write_to_json(infos)   # then persist it
    # Read the movie data back from movie.json.
    # Fix: use a context manager so the file handle is closed (the original
    # json.load(open(...)) leaked it).
    with open('movie.json', 'r', encoding='utf-8') as fp:
        infos = json.load(fp)
    for item in infos:
        print(item)
股吧:
1,字段
阅读
评论
标题
作者
更新时间
详情页
2、把前10页的内容保存到json文件。
import requests,re,json
#判空校验
# Null-safe wrapper around Match.group().
def get_match(match, number):
    """Return group *number* of a regex match, or '' when the match failed."""
    if match is None:
        return ''
    return match.group(number)
def write_to_json(infos):
    """Persist the scraped guba posts to guba.json.

    :param infos: list of dicts, one per post.
    """
    with open('guba.json', 'w', encoding='utf-8') as fp:
        # ensure_ascii=False keeps Chinese post titles human-readable
        json.dump(infos, fp, ensure_ascii=False)
def parse_page(html_str, out=None):
    """Parse one guba list page and collect post items.

    Extracted fields per post: read_num, comment_num, title, detail href.

    :param html_str: raw HTML of the list page.
    :param out: optional list that receives the items; when omitted the
        module-level ``infos`` list is used.

    Bug fixed: the original assigned the <cite> findall result to a local
    variable named ``infos``, shadowing the module-level list — every item
    was appended to that throwaway list, so the global stayed empty and
    write_to_json() wrote ``[]``. Also fixed: ``read_num``/``comment_num``
    were unbound (NameError) when a <li> did not contain exactly two
    <cite> tags; they now default to ''.
    """
    target = out if out is not None else infos
    # Narrow the match step by step: the <ul> list, then each <li>, then fields.
    ul_p = re.compile(r' <ul class="newlist"(.*?)</ul>', re.S)
    ul_m = ul_p.search(html_str)
    if ul_m is None:
        return
    li_p = re.compile(r'<li>(.*?)</li>', re.S)
    cite_p = re.compile(r'<cite>(.*?)</cite>', re.S)
    title_p = re.compile(r'" title="(.*?)" class=', re.S)
    href_p = re.compile(r' <a href="(.*?)" title="', re.S)
    for li in li_p.findall(ul_m.group()):
        # Read count and comment count both live in <cite> tags.
        cites = cite_p.findall(li)
        read_num = cites[0].strip() if len(cites) == 2 else ''
        comment_num = cites[1].strip() if len(cites) == 2 else ''
        title_m = title_p.search(li)
        href_m = href_p.search(li)
        # The page carries relative detail URLs; prefix the site root.
        href = 'http://guba.eastmoney.com' + (href_m.group(1) if href_m else '')
        item = {
            'read_num': read_num,
            'comment_num': comment_num,
            'title': title_m.group(1) if title_m else '',
            'href': href,
        }
        print(item)
        target.append(item)
def main():
    """Download guba list pages 1-12 and feed each one into parse_page."""
    base_url = 'http://guba.eastmoney.com/default,99_%s.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    page = 1
    while page < 13:
        resp = requests.get(base_url % page, headers=headers)
        parse_page(resp.text)
        page += 1
if __name__ == '__main__':
    # Module-level accumulator: parse_page appends every scraped item here,
    # so this exact name must stay as-is.
    infos = []
    main()
    # Persist everything that was collected to guba.json.
    write_to_json(infos)
未完待续…
[1-9] ↩︎