Python3的爬虫程序-百度新闻

自己做的一个Python3的爬虫程序,获取的是百度新闻的一部分,如有遗漏请指教。

注:程序可能需要运行两次才能完整抓取,该问题尚待解决。

# -*- coding:utf-8 -*-
# @author Lugr
# @link "http://news.baidu.com"
# @date 2019-01-31
import bs4
from bs4 import BeautifulSoup
import requests
import os,time,re

def getHtml(url, timeout=10):
	"""Download *url* and parse the response body as HTML.

	Args:
		url: Absolute URL of the page to fetch.
		timeout: Seconds to wait for the server before giving up.
			``requests`` applies no timeout by default, so without this a
			single stalled connection would block the whole crawl.

	Returns:
		A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

	Raises:
		requests.exceptions.RequestException: if the request fails or
			times out.
	"""
	response = requests.get(url, timeout=timeout)
	# Hand over the raw bytes (not response.text) so that BeautifulSoup
	# works out the page encoding itself.
	return BeautifulSoup(response.content, 'html.parser')
def save_title(filename,soup):
	"""Write today's section index, then crawl every section listed in it.

	A folder named ``baidu-YYYYMMDD`` (local date) is created in the current
	directory. Inside it, the index file *filename* receives one
	``name<TAB>url<TAB>`` line for every anchor of *soup* whose ``href``
	contains exactly one ``/`` (for example ``/guonei``). For each index
	line a sub-folder named after the entry is created, the linked page is
	fetched with ``getHtml`` and passed on to ``getTwo``.

	Args:
		filename: Name of the index file inside the dated folder.
		soup: Parsed front page; only ``soup.find_all("a")`` is used.

	Note:
		Absolute URLs are built by prefixing the module-level ``url``
		variable to each ``href``.
	"""
	folder = "baidu-" + time.strftime("%Y%m%d")
	index_path = folder + "/" + filename
	os.makedirs(folder, exist_ok=True)

	# Key on the index file rather than on the folder: if an earlier run
	# died after creating the folder, the index is rebuilt instead of the
	# crawl silently reusing a missing or incomplete one.
	if not os.path.exists(index_path):
		entries = []
		for x in soup.find_all("a"):
			href = x.get("href")
			# .string is None when the anchor wraps nested tags; collapsing
			# whitespace keeps tabs/newlines out of the tab-separated file.
			name = " ".join((x.string or "").split())
			if not href or not name:
				continue
			if href.count("/") == 1:
				entries.append("{}\t{}\t\n".format(name, url + href))
		# Collect first, write once, so a failure while scanning the page
		# cannot leave a half-written index behind.
		with open(index_path, mode="w", encoding="utf-8") as f:
			f.writelines(entries)

	# Read the index back from the same file that was written above.
	with open(index_path, encoding="utf-8") as fd:
		rows = [line.split("\t") for line in fd]

	for fields in rows:
		if len(fields) < 2:
			continue  # blank or malformed line
		fold = folder + "/" + fields[0]
		# The section folder may be missing when the index came from an
		# earlier run, so always make sure it exists before writing into it.
		os.makedirs(fold, exist_ok=True)
		getTwo(fold, "slist.txt", getHtml(fields[1].strip()))
def getTwo(folder,filename,soup):
	"""Save the headline links of a section page, then download them.

	Writes one ``title<TAB>href<TAB>`` line to ``folder/filename`` for every
	anchor in *soup* that has an ``href`` and whose text is longer than 12
	characters (presumably to leave out short navigation links), then calls
	``getContent(folder)``.

	Args:
		folder: Existing directory that receives the link list.
		filename: Name of the link-list file. ``getContent`` always reads
			``slist.txt``, so pass that name for the downloads to see it.
		soup: Parsed section page; only ``soup.find_all("a")`` is used.
	"""
	lines = []
	for x in soup.find_all("a"):
		href = x.get("href")
		if not href:
			continue  # anchors without a target cannot be downloaded
		# Use the full text instead of .string (which is None for anchors
		# with nested tags) and collapse whitespace so that tabs/newlines in
		# a headline cannot break the one-entry-per-line format.
		title = " ".join(x.get_text().split())
		if len(title) > 12:
			lines.append("{}\t{}\t\n".format(title, href))
	with open(folder+"/"+filename, mode="w") as con:
		con.writelines(lines)
	getContent(folder)
def getContent(folder, timeout=10):
	"""Download every page listed in ``folder/slist.txt`` into *folder*.

	Each line of the list is ``title<TAB>url<TAB>``. The page is stored as
	``<sanitised title>.html`` (see ``setFileTitle``); entries whose file
	already exists are not downloaded again. Progress is printed as
	``<counter>_<file title>``.

	Args:
		folder: Directory containing ``slist.txt``; also the download target.
		timeout: Seconds to wait for each page before giving up on it.

	Failures of a single download are printed and skipped so that one bad
	link does not stop the rest of the list.
	"""
	print(folder)
	with open(folder+"/slist.txt", mode="r") as text_link:
		rows = [line.split("\t") for line in text_link]

	count = 0
	for fields in rows:
		if len(fields) < 2:
			continue  # blank or malformed line
		title, link = fields[0], fields[1]
		# Skip entries that have no usable title OR no real target; the two
		# conditions must be combined so that either one rejects the entry.
		if title == "None" or link.startswith("javascript:"):
			continue
		str_text = setFileTitle(title)
		count += 1
		print(str(count)+"_"+str_text)
		target = folder+"/"+str_text+".html"
		if os.path.exists(target):
			continue
		try:
			# Download before creating the file: a failed request must not
			# leave an empty .html behind, because an existing file is
			# treated as "already downloaded" on the next run.
			page = requests.get(link, timeout=timeout).content
			with open(target, mode="wb") as out:
				out.write(page)
		except Exception as e:
			# Deliberately broad: the crawl is best-effort per link.
			print(e)
def d(url):
	"""Run the crawl for *url*: parse the page and hand it to ``save_title``.

	The section index is always stored under the name ``title.txt``.
	"""
	save_title("title.txt", getHtml(url))
def setFileTitle(title):
	"""Return *title* with every character unsafe in a file name replaced by ``_``.

	Replaced characters: ``\\ / : * ? " < > |``, the dot, and the control
	characters newline, carriage return and tab. All other characters,
	including ordinary letters, are kept unchanged.

	Args:
		title: Headline text to turn into a file name stem.

	Returns:
		The sanitised string; same length as *title*.
	"""
	# Raw string so that \n, \r and \t reach the regex engine as escapes for
	# the control characters rather than matching the letters n and t.
	return re.sub(r'[\\/:*?"<>|.\n\r\t]', '_', title)
# Site root. Kept as a module-level global because save_title() reads it to
# turn the relative section links into absolute URLs.
url="http://news.baidu.com"

# Only start crawling when executed as a script, so that importing this
# module does not trigger network requests or create folders.
if __name__ == "__main__":
	print("start")
	d(url)
	print("end")

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值