使用 requests 库和正则表达式爬取猫眼电影 TOP100 榜单
完整代码如下:
import requests
import re
def getHtmltext(url, timeout=10):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (connection error, timeout, or an HTTP error status).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into an exception so they are reported as failures.
        response.raise_for_status()
        # Decode using the charset detected from the body itself.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; anything else
        # (e.g. KeyboardInterrupt or a programming error) propagates.
        return None
def parsehtml(html):
    """Extract ``(rank, title)`` pairs from the source of one board page.

    Args:
        html: Page source as a string. ``None`` or an empty string is
            accepted and yields no results.

    Returns:
        A list of ``(rank, title)`` tuples of strings, in document order.
        The list is empty when nothing matches or no HTML was supplied.
    """
    if not html:
        # A failed download is passed in as None; re.findall would raise
        # TypeError on it, so report "no entries" instead.
        return []
    pattern = (
        r'<i class=".*?">(\d+)</i>'            # rank number inside the <i> tag
        r'\s*'                                 # the two tags sit on separate lines
        r'<a href=".*?" title="(.*?)" class='  # title attribute of the link
    )
    return re.findall(pattern, html)
def printlist(List):
    """Print one row per entry of ``List``.

    Each entry is indexed as ``entry[0]`` (rank) and ``entry[1]`` (title).
    The rank is left-aligned in a 6-character column and the title in a
    12-character column.
    """
    render_row = "{:<6}{:<12}".format
    for entry in List:
        print(render_row(entry[0], entry[1]))
def main():
    """Print the header row, then fetch, parse and print every board page.

    Ten pages are requested; the ``offset`` query parameter advances by 10
    per page (0, 10, ..., 90).
    """
    header = "{:<6}{:<12}".format('名次', '电影名')
    print(header)
    for offset in range(0, 100, 10):
        page_url = "https://maoyan.com/board/4?offset=" + str(offset)
        # Download -> extract (rank, title) pairs -> print them.
        printlist(parsehtml(getHtmltext(page_url)))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()