题目要求:
爬取这个网站:http://maoyan.com/board/4?offset=0 上TOP100电影的①电影名②演员③日期④图片 并写入数据库。
代码如下:
#!/usr/bin/env python
# coding:utf-8
"""Scrape the Maoyan TOP100 board (http://maoyan.com/board/4) for each film's
name, actors, release date and poster URL.

Workflow (two passes, as the commented-out code in __main__ shows):
  1. download the 10 board pages and cache the HTML into a local file;
  2. re-read the cached file offline and parse it with the regexes below.

NOTE: get_content strips every ASCII space from the HTML, which is why the
regex patterns contain no spaces (e.g. '<pclass="name">').
"""
import re
import urllib
from urllib import *
from urllib import request

# pymysql is only needed for the (not yet written) database-insert step;
# make it best-effort so the scraping/parsing part still runs without it.
try:
    import pymysql  # noqa: F401
except ImportError:
    pymysql = None  # NOTE(review): DB write not implemented in visible code

# Base board URL; pages are addressed by offset=0,10,...,90.
url = r'http://maoyan.com/board/4?offset='


def get_content(url):
    """Fetch *url* and return its HTML decoded as UTF-8 with all ASCII
    spaces removed (so the space-free regexes below can match)."""
    with request.urlopen(url) as f:
        content = f.read().decode('utf-8').replace(' ', '')
    return content


def create_url(url):
    """Return the 10 page URLs ``url + '0'`` ... ``url + '90'`` (step 10)."""
    url_li = []
    for i in range(0, 100, 10):
        newurl = url + '%d' % i
        url_li.append(newurl)
    return url_li


def get_film(content):
    """Extract every film title from the space-stripped board HTML."""
    pattern = r'<pclass="name"><ahref=".*"title="(.*)"data-act="boarditem-click"data-val="{movieId:.*}">'
    return re.findall(pattern, content)


def get_date(content):
    """Extract every release-date string (may still carry a trailing
    parenthesised region note, e.g. '1993-01-01(中国香港)')."""
    pattern = r'<pclass="releasetime">上映时间:(.*)\(?.*\)?</p></div>'
    return re.findall(pattern, content)


def get_act(content):
    """Extract every actor-list string following the '主演:' label."""
    pattern = r'主演:(.+)'
    return re.findall(pattern, content)


def get_purl(content):
    """Extract every poster-image URL from the board-img tags."""
    pattern = r'<imgdata-src="(.+)"alt=".*"class="board-img"/>'
    return re.findall(pattern, content)


if __name__ == '__main__':
    # Pass 1 (already performed once; kept for reference): download all
    # pages and cache them so parsing can be re-run offline.
    # urls = create_url(url)
    # with open('E:\\wenjian.txt', 'a+') as f:
    #     for u in urls:
    #         f.write(get_content(u))
    with open('E:\\wenjian.txt', 'r') as f:
        neirong = f.read()
    films = get_film(neirong)
    acts = get_act(neirong)
    dates = get_date(neirong)
    print(dates)
    # Strip the parenthesised region note from each date.
    # BUG FIX: the original used range(0, len(dates)-1), which silently
    # skipped the final entry; iterate every index instead.
    for i in range(len(dates)):
        if '(' in dates[i]:
            dates[i] = re.sub(r'\(.+\)', '', dates[i])
    # Debug probe from the original author (assumes at least 13 dates).
    print(dates[12] + '-01-01')
    # NOTE(review): SOURCE is truncated here — the original ended mid-statement
    # with 'for i in range(0,len(dates)-1):' and no body (presumably the
    # pymysql insert loop). Left as a no-op pending the missing code.
    for i in range(0, len(dates) - 1):
        pass  # TODO: restore truncated DB-write body