代码部分
# -*- coding: utf-8 -*-
# @Time    : 2021/8/5 17:46
# @Author  : Sarah
# @File    : spider.py
# @Software: PyCharm
"""Scrape the Douban Movie Top 250 list (https://movie.douban.com/top250).

Pipeline: 1) crawl all 10 result pages, 2) parse each movie entry,
3) save the data (saving is still a placeholder in this version).
"""

from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import sqlite3


def main():
    """Entry point: crawl and parse all pages, then (TODO) save the data."""
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages and parse the movie entries out of them.
    datalist = getData(baseurl)
    savepath = ".\\豆瓣电影top250.xls"
    # 2. Save the data — not implemented yet, so the call stays disabled.
    # saveData(savepath)


def getData(baseurl):
    """Fetch all 10 result pages (25 movies each) and parse the movie items.

    The items are currently only printed; extracting fields into
    ``datalist`` is still TODO, so the returned list is empty for now.

    :param baseurl: page URL prefix; the start offset is appended to it.
    :return: list of parsed movie records (currently always empty).
    """
    datalist = []
    for i in range(10):
        # Douban paginates with start offsets 0, 25, 50, ..., 225.
        url = baseurl + str(i * 25)
        html = askURL(url)  # raw HTML source of one result page
        soup = BeautifulSoup(html, "html.parser")
        # One <div class="item"> per movie entry on the page.
        # find_all is the modern spelling of the deprecated findAll alias.
        for item in soup.find_all("div", class_="item"):
            print(item)
    return datalist


def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8 ('' on error).

    A browser-like User-Agent header is sent so Douban does not reject
    the request as coming from a bot.

    :param url: absolute URL to fetch.
    :return: decoded page source, or the empty string if the request failed.
    """
    # BUG FIX: the original User-Agent contained stray spaces
    # ("Mozilla / 5.0(Windows NT ... Chrome / 92.0 .4515 .131 ..."),
    # which Douban's anti-bot check rejects — every fetch came back empty.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/92.0.4515.131 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # Best-effort logging of whatever diagnostics the error carries,
        # then fall through and return '' so the caller can keep going.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(savepath):
    """Save the scraped data to *savepath* (placeholder — not implemented).

    Moved above the __main__ guard so that main() can call it once the
    commented-out call there is enabled.
    """
    print("保存好了")


if __name__ == "__main__":
    main()