import requests
from requests.exceptions import RequestException
import json
import re,os
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Matches one <dd>...</dd> entry of the board page. Capture groups, in order:
# the digits inside the "board-index" element, the text of the first <a> tag,
# the text of <p class="star"> after its leading newline, and the text that
# follows "上映时间:" inside <p class="releasetime">.
# Raw strings keep \d, \S and \n as regex escapes instead of (invalid)
# string-literal escapes.
_BOARD_ENTRY_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?<a.*?>(\S+)</a>.*?<p class="star">'
    r'\n(.*?)</p>.*?<p class="releasetime">上映时间:(.*?)</p>.*?</dd>',
    re.S,
)


def parse_one_page(html):
    """Yield one dict per board entry found in ``html``.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing instead of raising ``TypeError`` inside the regex engine.

    Yields:
        Dicts with the keys ``'index'``, ``'title'``, ``'star'`` and
        ``'time'``, holding the captured text exactly as it appears in the
        markup (no whitespace stripping is applied).
    """
    if not html:
        return
    for index, title, star, release_time in _BOARD_ENTRY_PATTERN.findall(html):
        yield {
            'index': index,
            'title': title,
            'star': star,
            'time': release_time,
        }
def write_to_file(content, path='result.txt'):
    """Append ``content`` to ``path`` as a single JSON line.

    Args:
        content: Any JSON-serialisable object.
        path: File to append to; defaults to ``'result.txt'`` in the current
            working directory. The file is created if it does not exist.

    The text is written as UTF-8 with ``ensure_ascii=False`` so non-ASCII
    characters are stored verbatim rather than as ``\\uXXXX`` escapes.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def file_change(path="result"):
    """Rewrite ``path`` in place with every newline replaced by a space.

    The converted text is first written to a scratch file named
    ``path + "2"`` and then moved over the original.

    Args:
        path: File to rewrite; defaults to ``"result"``.

    NOTE(review): write_to_file in this file appends to 'result.txt', not
    'result' - confirm which file this function is meant to process.
    """
    scratch_path = path + "2"
    with open(path, "r", encoding="utf-8") as src, \
            open(scratch_path, "w", encoding="utf-8") as dst:
        for line in src:
            dst.write(line.replace("\n", " "))
    # os.replace overwrites the destination in one step, so there is no
    # window in which the original has been removed but the new file has
    # not yet been renamed into place.
    os.replace(scratch_path, path)
def main(offset):
    """Fetch one board page, print it, and store every parsed entry.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.

    The raw page and each parsed item are printed; each item is also passed
    to ``write_to_file``. When the page cannot be fetched, a message is
    printed and nothing is parsed or written.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page signals a failed download with None; there is
        # nothing to parse in that case.
        print('Failed to fetch ' + url)
        return
    print(html)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Call main once for each offset 0, 10, ..., 90 (ten requests in total).
    for page_offset in range(0, 100, 10):
        main(page_offset)
    # Optional post-processing step, currently disabled:
    # file_change()
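# Source article title: "python爬取猫眼top100电影信息" (Scraping Maoyan Top-100 movie information with Python)
# Page footer residue: latest recommended article published 2024-05-14 14:07:38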