# Setup (run once before first use):
# (1) Download BeautifulSoup 4.x from http://www.crummy.com/software/BeautifulSoup/bs4/download/
#     Install: open cmd -> cd into the unpacked BeautifulSoup directory -> run: python setup.py install
# (2) Create the file the headlines are saved to: NewsTitle.txt
# (3) Run the scraper: python getnews.py
# -*- coding: utf-8 -*-
import urllib
import os
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
i = 0
j = 0
# list_a = []
# def gettext(href):
# global j,list_a
# page = urllib.urlopen(href).read()
# soup = BeautifulSoup(page,from_encoding="gb18030")
# div = soup.find_all("div",class_="content")
# p_text = div[0].find_all("p")
# for p in p_text:
# fp = file("%s.txt" % list_a[j],"a")
# fp.write(' ')
# fp.write(p.get_text())
# fp.write(" \n")
# j+=1
def gethref(url):  # collect all headline links/titles
    """Fetch the news front page at *url* and write its <h1> headline
    titles, numbered, one per line, to NewsTitle.txt (overwriting it).

    Side effects: increments the module-global counter ``i`` once per
    title and rewrites NewsTitle.txt in the current directory.
    """
    global i
    # `with` guarantees the output file is flushed and closed even on error
    # (the original used file(...) and never closed the handle).
    with open("NewsTitle.txt", "w+") as fp:
        page = urllib.urlopen(url).read()
        # Sina pages are GB-encoded; gb18030 is the superset that decodes them all.
        soup = BeautifulSoup(page, from_encoding="gb18030")
        div = soup.find_all("div", class_="ct_t_01")
        # Bug fix: the original called div[0].findall("h1") — BeautifulSoup
        # has find_all/findAll, but no "findall", so this always raised
        # AttributeError before any output was written.
        h1 = div[0].find_all("h1")
        for lia in h1:
            # Write "<n>、<title>\n" for each headline found.
            fp.write("%s、" % (i + 1))
            i += 1
            fp.write(lia.get_text())
            fp.write("\n")
if "__main__"==__name__:
url ="http://news.sina.com.cn/"
gethref(url)
print "All Is OK!"