运行环境:
- Windows 10
- Python 3.6.7
- Visual Studio Code
源代码:
import requests
import re
from requests.exceptions import RequestException
import json
import time
# Base address of the board page; main() builds its own per-page URL.
url = "http://maoyan.com/board/4"
def get_text(url):
    """Download ``url`` and return the decoded response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or ``None`` when the request fails or the server
        answers with an HTTP error status.
    """
    # A browser-like User-Agent is sent instead of the requests default.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Without a timeout a stalled connection would block indefinitely.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except RequestException:
        # Only network/HTTP failures are turned into None; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        return None
    # Decode with the encoding detected from the body content.
    r.encoding = r.apparent_encoding
    return r.text
def get_one_page(text):
    """Extract ranking entries from one board page and save each poster.

    For every movie matched in ``text`` the poster image is downloaded into
    the ``D://pics//`` folder and a record describing the movie is yielded.

    Args:
        text: HTML of one board page. ``None`` or an empty string (for
            example after a failed download) produces no records.

    Yields:
        dict: ``{'排名': rank, '电影名': title}`` for each matched movie.
    """
    if not text:
        # re.findall raises TypeError on None, so stop before matching.
        return
    expression = '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)@160w_.*?data-val.*?>(.*?)</a>'
    # re.S lets '.' match newlines, so one entry may span several lines.
    pattern = re.compile(expression, re.S)
    root = 'D://pics//'  # output folder for the images; adjust as needed
    # Each match is a (rank, image URL, title) tuple.
    for rank, image_url, title in pattern.findall(text):
        path = root + '第' + rank + '名' + ' ' + title + '.bmp'
        try:
            picture = requests.get(image_url, timeout=10)
            picture.raise_for_status()
        except RequestException as err:
            # A poster that cannot be fetched should not abort the run or
            # leave an error page saved as an image; the record is still
            # yielded below.
            print('Could not download poster {}: {}'.format(image_url, err))
        else:
            # The with-statement closes the file; no explicit close needed.
            with open(path, 'wb') as f:
                f.write(picture.content)
        yield {
            '排名': rank,
            '电影名': title,
        }
def write_to_file(content):
    """Append ``content`` to the ranking file as one JSON-encoded line.

    Args:
        content: JSON-serialisable object (``main`` passes one movie dict).
    """
    path = 'D://pics//电影排名.csv'
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is fixed explicitly instead of relying on the platform
    # default, which may be unable to encode some of them.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    """Scrape the board page starting at ``offset`` and record its entries.

    Each entry is printed to the console and appended to the ranking file.

    Args:
        offset: Value placed in the ``offset`` query parameter of the URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    text = get_text(url)
    if text is None:
        # get_text signals a failed download with None; skip this page
        # instead of passing None on to the parser.
        print('Failed to fetch ' + url)
        return
    for item in get_one_page(text):
        print("{:5} {:6}".format(item['排名'], item['电影名']))
        write_to_file(item)
if __name__ == '__main__':
    # Visit ten pages with offsets 0, 10, ..., 90.
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)
        # Wait one second before requesting the next page.
        time.sleep(1)
控制台输出:
![](https://i-blog.csdnimg.cn/blog_migrate/ae91e703b32c5a2c2691cb0924989871.png)
最终效果:
![](https://i-blog.csdnimg.cn/blog_migrate/5e4577db522eaac07608d8475890bad3.png)