"""Scrape article listings from www.gov.cn and store each article in MySQL.

For every listing page, each linked article's URL, title, timestamp,
source, author, body text and (optional) video URL are extracted and
inserted into the ``zhongguozhengfu`` table, and a short entry is
appended to a log file.

Fixes over the original script:
- parameterized SQL (was string-concatenated -> injection and quoting bugs;
  the quote-mangling ``.replace('"', '\u201c')`` workaround is no longer needed)
- one DB connection per run instead of open/close per article
- log file opened once in append mode (was rewriting the whole log per article)
- narrow exception handling instead of bare ``except:``
- datetime fallback stored as a string (the original concatenated a
  ``datetime`` object into a SQL string -> TypeError on that path)
"""

import datetime
import urllib.request

import pymysql
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                  'Gecko/20100101 Firefox/23.0',
}

# NOTE(review): the original built '…home_0.htm' + str(i) + '.shtml', which
# produces invalid URLs like 'home_0.htm1.shtml'. The page index almost
# certainly belongs in place of the trailing 0 — confirm against the live
# site's pagination scheme.
LIST_URL_TEMPLATE = 'https://www.gov.cn/yaowen/liebiao/home_{}.shtml'

LOG_PATH = 'zhongguozhengfuLog.txt'

INSERT_SQL = (
    'INSERT INTO zhongguozhengfu'
    '(ARTICLE_URL,ARTICLE_TITLE,ARTICLE_DATETIME,ARTICLE_SOURCE,'
    'ARTICLE_AUTHOR,ARTICLE_TEXT,ARTICLE_VIDEO) '
    'VALUES(%s,%s,%s,%s,%s,%s,%s)'
)


def _fetch_soup(url):
    """Download *url* with the browser-like headers and parse it as HTML."""
    req = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(req) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def _extract(getter, default='N/A'):
    """Return ``getter()``, or *default* when the expected markup is absent.

    Missing tags surface as AttributeError (``None.foo``), missing
    attributes as KeyError (``tag['src']``), and odd node types as
    TypeError — the only failures the original bare ``except:`` was
    actually papering over.
    """
    try:
        return getter()
    except (AttributeError, KeyError, TypeError):
        return default


def _parse_article(h4):
    """Fetch the article linked from listing entry *h4*.

    Returns a 7-tuple matching the column order of ``INSERT_SQL``:
    (url, title, datetime, source, author, text, video_url).
    Fields that cannot be found are 'N/A'; a missing timestamp falls
    back to "now" so the row is still insertable.
    """
    url = _extract(lambda: h4.a['href'])
    title = _extract(lambda: h4.a.get_text())

    soup = _fetch_soup(url)
    article = soup.find('div', {'class': 'mod-content'})

    # Site-specific selectors: the span texts carry a 3-character Chinese
    # label prefix ("来源:" etc.) that [3:] strips; dates use '.' separators
    # that MySQL wants as '-'. 'autor-name fr' is the site's own typo —
    # do not "fix" it, it must match the page markup.
    article_dt = _extract(
        lambda: article.div.span.get_text()[3:].replace('.', '-'),
        default=str(datetime.datetime.now()),
    )
    source = _extract(
        lambda: article.div.find('span', {'class': 'pic-source'}).get_text()[3:])
    author = _extract(
        lambda: article.div.find('span', {'class': 'autor-name fr'}).get_text()[3:])
    text = _extract(
        lambda: article.find('div', {'class': 'pic-content'}).get_text())
    video = _extract(
        lambda: article.find('div', {'class': 'video-container'}).find('video')['src'])

    return (url, title, article_dt, source, author, text, video)


def main():
    """Crawl listing pages 1..98 and persist every article found."""
    # One connection for the whole run; utf8mb4 so Chinese titles/bodies
    # survive the round-trip.
    conn = pymysql.connect(user='root', password='123456',
                           database='website_db', charset='utf8mb4')
    try:
        with conn.cursor() as cursor, \
                open(LOG_PATH, 'a', encoding='utf-8') as log:
            for page in range(1, 99):
                print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Page:' + str(page))
                listing = _fetch_soup(LIST_URL_TEMPLATE.format(page))
                article_list = listing.find('ul')
                if article_list is None:
                    continue  # page layout changed or empty page
                for h4 in article_list.findAll('h4'):
                    row = _parse_article(h4)
                    url, title, article_dt, _, _, _, video = row
                    print(article_dt, url, title, video)
                    # Parameterized insert: the driver does all quoting.
                    cursor.execute(INSERT_SQL, row)
                    conn.commit()  # commit per article, as before
                    log.write('\n%s\n%s\n%s\n'
                              % (datetime.datetime.now(), url, title))
                    log.flush()
    finally:
        conn.close()


if __name__ == '__main__':
    main()