import urllib.request
from lxml import etree
import pymysql
# Fetch the HTML of every board list page (offsets 0, 10, ..., 90).
def get_all_html(url):
    """Yield the decoded HTML of each paginated board page.

    url: base URL ending in 'offset='; the numeric page offset is appended.
    Yields: one UTF-8-decoded HTML string per page.
    """
    for offset in range(0, 100, 10):
        page_url = url + str(offset)
        # Context manager closes the HTTP response; the original leaked it
        # (urlopen() result was never closed). Also dropped the dead Url='' assignment.
        with urllib.request.urlopen(page_url) as response:
            yield response.read().decode('utf-8')
# Pull the per-movie fields out of each page's HTML.
def get_info(html):
    """For every HTML page string in *html*, yield a 7-tuple of XPath
    result lists: (rank, title, star line, image src, release time,
    integer score part, fractional score part).
    """
    for page in html:
        tree = etree.HTML(page)
        yield (
            tree.xpath('//*[@class="board-wrapper"]/dd/i/text()'),
            tree.xpath('//*[@class="board-wrapper"]/dd/a/img[@class="board-img"]//@alt'),
            tree.xpath('//*[@class="movie-item-info"]/p[@class="star"]/text()'),
            tree.xpath('//*[@class="board-wrapper"]/dd/a/img[@class="board-img"]//@data-src'),
            tree.xpath('//*[@class="movie-item-info"]/p[@class="releasetime"]/text()'),
            tree.xpath('//*[@class="integer"]/text()'),
            tree.xpath('//*[@class="fraction"]/text()'),
        )
# Normalize the raw XPath results into flat per-movie records.
def save_file(info):
    """For each page tuple produced by get_info, yield an iterable of
    (rank, title, actors, image, releasetime, score) tuples with
    surrounding whitespace stripped and the two score halves joined.
    """
    for ranks, titles, stars, imgs, times, whole, frac in info:
        yield (
            (a.strip(), b.strip(), c.strip(), d.strip(), e.strip(), f.strip() + g.strip())
            for a, b, c, d, e, f, g in zip(ranks, titles, stars, imgs, times, whole, frac)
        )
# Persist one cleaned movie record into MySQL.
def save_dabase(j):
    """Insert one (rank, name, actors, img, releasetime, score) tuple
    into the `movie` table, rolling back and reporting on failure.
    """
    # Keyword arguments: positional connect() arguments were removed in PyMySQL 1.0.
    bp = pymysql.connect(host="localhost", user="root", password="root",
                         database="test", charset='utf8')
    cour = bp.cursor()
    sql = '''insert into movie(index1,movie_name,actors,img,releasetime,score)values(%s,%s,%s,%s,%s,%s)'''
    try:
        cour.execute(sql, j)
        bp.commit()
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        bp.rollback()
        print('错误')
    finally:
        # Always release the connection, even if rollback itself raises.
        bp.close()
# Pipeline driver: fetch every page, parse it, store each record.
def fun_call(url):
    """Run the full scrape pipeline for *url*, writing every movie to MySQL."""
    cleaned = save_file(get_info(get_all_html(url)))
    for batch in cleaned:
        for record in batch:
            save_dabase(record)
# Entry point for the urllib + lxml scraper.
def main():
    """Scrape the whole Maoyan top-100 board starting at offset 0."""
    fun_call('http://maoyan.com/board/4?offset=')
# Run the first (urllib + lxml + pymysql) scraper when executed as a script.
if __name__=='__main__':
    main()
# requests + re + pymysql
import requests
import re
import pymysql
from requests.exceptions import RequestException
# 1. Fetch a single list page.
def get_one_page(url):
    """Return the HTML text of *url*, or a fallback string on failure.

    Always returns a str: the original implicitly returned None for
    non-200 responses, which made the caller crash in re.findall with
    a TypeError. An empty string simply yields zero movies instead.
    """
    try:
        # Browser-like User-Agent header (same one the original sent).
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Return the page body on success.
            return response.text
        return ''  # non-200: no content, but still a str
    except RequestException:
        return '请求异常'
# Parse one page of HTML into movie records (rank, title, poster, cast, date, score).
def parse_one_page(html):
    """Yield one dict per <dd> movie entry found in *html*.

    re.S makes '.' match newlines, so each sub-pattern can span the
    whole multi-line <dd> block instead of stopping at a line break.
    """
    # Raw string literals: '\d' inside a plain string is an invalid escape
    # sequence (SyntaxWarning since Python 3.12, scheduled to become an error).
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    for item in re.findall(pattern, html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the leading '主演:' label (3 chars)
            'time': item[4].strip()[5:],    # drop the leading '上映时间:' label (5 chars)
            'score': item[5] + item[6],     # integer part + fractional part, e.g. '9.' + '5'
        }
# Store one parsed movie dict in MySQL.
def write_to_mysql(content):
    """Insert one movie dict (keys: index/image/title/actor/time/score)
    into the `movie` table of the `mao` database.
    """
    # `password=` replaces the deprecated `passwd` alias.
    conn = pymysql.connect(host='localhost', user='root', password='123456',
                           db='mao', charset='utf8')
    try:
        cur = conn.cursor()
        # Parameterized query: pymysql escapes the values (no SQL injection).
        sql = 'insert into movie values(%s,%s,%s,%s,%s,%s)'
        parm = (content['index'], content['image'], content['title'],
                content['actor'], content['time'], content['score'])
        cur.execute(sql, parm)
        conn.commit()
        cur.close()
    finally:
        # Close the connection even when execute/commit raises;
        # the original leaked it on any exception.
        conn.close()
# Main routine for the requests + re scraper: handles one page per call.
def main(offset):
    """Fetch, parse, and store the board page at the given *offset*."""
    page_html = get_one_page('http://maoyan.com/board/4?offset=' + str(offset))
    for movie in parse_one_page(page_html):
        write_to_mysql(movie)
# Scrape all ten pages (offsets 0, 10, ..., 90) when run as a script.
if __name__ == '__main__':
    for offset in range(0, 100, 10):
        main(offset)
# Demo: how to walk two lists at the same time.
# Tuple unpacking iterates a list of fixed-size tuples directly.
list1 = [(1, 2, 5), (3, 4, 2), (5, 6, 2)]
for i, j, h in list1:
    print(i, j, h)
list2 = [2, 2, 5]
list3 = [3, 4, 2]
# zip pairs up elements of both lists position by position.
for i, j in zip(list2, list3):
    print(i, j)
# map is lazy in Python 3: the original built the map object and threw it
# away, so the lambda never ran. Materialize it with list() and show the pairs.
print(list(map(lambda x, y: (x, y), list2, list3)))
# Remove duplicate values by converting the list to a set.
print(set(list2))