A Python 3 crawler I wrote myself that grabs part of Baidu News; if anything is missing, feedback is welcome.
Note: the program may need to be run twice before it works; this issue is still unresolved.
# -*- coding:utf-8 -*-
# @author Lugr
# @link "http://news.baidu.com"
# @date 2019-01-31
from bs4 import BeautifulSoup
import requests
import os, time, re
def getHtml(url):
    # Fetch the page and hand it to BeautifulSoup for parsing
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    return soup
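# Note: requests.get() above is called with no headers and no timeout. If Baidu
# rejects bare requests or a fetch hangs, a common hardening step (just a
# sketch, the header value is only an example) is something like:
#     requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)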
def save_title(filename, soup):
    ntime = time.strftime("%Y%m%d", time.localtime(time.time()))
    folder = "baidu-" + ntime
    if not os.path.exists(folder):
        os.mkdir(folder)
    # Collect the category links: site-relative hrefs containing exactly one "/"
    with open(folder + "/" + filename, mode="w+", encoding="utf-8") as f:
        for x in soup.find_all("a"):
            href = x.get('href', '')
            if x.string and len(re.findall(r'/', href)) == 1:
                # "url" is the module-level base address defined at the bottom
                f.write("{}\t{}\t\n".format(x.string, url + href))
                if not os.path.exists(folder + "/" + x.string):
                    os.mkdir(folder + "/" + x.string)
    # Read the list back and crawl each category page
    with open(folder + "/title.txt", mode="r", encoding="utf-8") as fd:
        for line in fd.readlines():
            fold = folder + "/" + line.split('\t')[0]
            s = getHtml(line.split('\t')[1])
            getTwo(fold, "slist.txt", s)
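# Side note: "url + href" only works because the category hrefs happen to be
# site-relative. urllib.parse.urljoin from the standard library handles both
# relative and absolute hrefs uniformly; a minimal sketch:
#     from urllib.parse import urljoin
#     urljoin("http://news.baidu.com", "/guonei")  # -> "http://news.baidu.com/guonei"
#     urljoin("http://news.baidu.com", "http://music.baidu.com/")  # absolute href kept as-is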
def getTwo(folder, filename, soup):
    # Save links whose anchor text is long enough to look like a headline
    with open(folder + "/" + filename, mode="w+", encoding="utf-8") as con:
        for x in soup.find_all("a"):
            if x.string is not None and len(x.text) > 12:
                con.write("{}\t{}\t\n".format(x.string, x.get('href', '')))
    getContent(folder)
def getContent(folder):
    print(folder)
    with open(folder + "/slist.txt", mode="r", encoding="utf-8") as text_link:
        i = 0
        for m in text_link.readlines():
            title, link = m.split("\t")[0], m.split("\t")[1]
            # Skip entries with no usable title or with a javascript pseudo-link
            if title == "None" or link == 'javascript:void(0);':
                continue
            str_text = setFileTitle(title)
            i += 1
            print(str(i) + "_" + str_text)
            # Skip pages that were already downloaded on an earlier run
            if os.path.exists(folder + "/" + str_text + ".html"):
                continue
            try:
                # "with" guarantees the file is closed even if the download fails
                with open(folder + "/" + str_text + ".html", mode="wb+") as page:
                    page.write(requests.get(link).content)
            except Exception as e:
                print(e)
def d(url):
    soup = getHtml(url)
    save_title("title.txt", soup)
def setFileTitle(title):
    # Replace characters that are illegal in (Windows) file names,
    # plus newlines, tabs and dots
    title = re.sub(r'\n', '_', title)
    title = re.sub(r'\.', '_', title)
    return re.sub(r'[\\/:*?"<>|\t]', '_', title)
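# Example: setFileTitle('国内/要闻:今日.新闻') returns '国内_要闻_今日_新闻'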
url="http://news.baidu.com"
print("start")
d(url)
print("end")