import requests
from bs4 import BeautifulSoup
import re
# Focused crawler: collect 500+ news articles from ithome.com (IT之家)
# The site's robots.txt is as follows:
#User-Agent: Baiduspider
#Disallow: /tags/
#Disallow: /tag/
#Disallow: /ithome/
#Disallow: /keywords/
#Disallow: /search/
#Disallow: /tag/adt_all*
#Disallow: /comment/
#Disallow: /*?*
#Disallow: /?*
#Disallow: /html/zixun/
#User-Agent: *
#Disallow: /ithome/
#Disallow: /keywords/
#Disallow: /search/
#Disallow: /comment/
#Disallow: /*?*
#Disallow: /?*
#Disallow: /html/zixun/
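# Note how the crawler below stays within these rules: the URL pattern used in
# main() skips /html/zixun/ via a (?!zixun) lookahead, and the disallowed
# /tag/, /search/ and /comment/ paths can never match the article-link pattern.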
# Define an article class
class Essay:
    # Article title
    __essayName = ""
    # Article author
    __essayAuthor = ""
    # Article category
    __sort = ""
    # Article editor
    __editor = ""
    # Publication time
    __releaseTime = ""

    def setEssayName(self, name):
        self.__essayName = name

    def getEssayName(self):
        return self.__essayName

    def setEssayAuthor(self, author):
        self.__essayAuthor = author

    def getEssayAuthor(self):
        return self.__essayAuthor

    def setSort(self, sort):
        self.__sort = sort

    def getSort(self):
        return self.__sort

    def setEditor(self, editor):
        self.__editor = editor

    def getEditor(self):
        return self.__editor

    def setReleaseTime(self, releaseTime):
        self.__releaseTime = releaseTime

    def getReleaseTime(self):
        return self.__releaseTime
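# Side note: the getter/setter style above mirrors Java. An equivalent record in
# more idiomatic Python would be a dataclass; a minimal sketch (EssayRecord is
# illustrative only and is not used by the crawler below):
from dataclasses import dataclass

@dataclass
class EssayRecord:
    essayName: str = ""
    essayAuthor: str = ""
    sort: str = ""
    editor: str = ""
    releaseTime: str = ""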
# Fetch the content of a URL
def getHtmlText(url):
    try:
        kv = {"user-agent": "Chrome/10.0"}
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'  # inspecting the site confirms the pages are utf-8
        return r.text
    except Exception:
        return "error"
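# Example: getHtmlText("http://www.ithome.com/") returns the front page's HTML,
# or the string "error" if the request fails or times out.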
# Collect every matching article URL on the current page
# (pattern: http://www.ithome.com/html/((?!zixun).)*/\d{6}.htm)
def getCurrentPageUrl(text, p):
    urls = set()
    soup = BeautifulSoup(text, "html.parser")
    for a in soup.find_all(href=p):
        urls.add(a.get('href'))
    return urls
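# find_all(href=p) works because BeautifulSoup accepts a compiled regular
# expression as an attribute filter: it keeps only tags whose href matches p.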
# Extract the article metadata from the current page
def getCurrentPageData(text):
    soup = BeautifulSoup(text, "html.parser")
    obj = Essay()
    # Publication time
    pub_time = soup.find(id="pubtime_baidu")
    if pub_time is not None:
        obj.setReleaseTime(pub_time.string)
    # Article title
    name = soup.find(attrs={"class": "post_title"})
    if name is not None:
        obj.setEssayName(name.find("h1").string)
    # Article author
    author = soup.find(id="author_baidu")
    if author is not None:
        obj.setEssayAuthor(author.find("strong").string)
    # Article category: the last link in the breadcrumb navigation
    sort = soup.find("div", attrs={"class": "current_nav"})
    if sort is not None:
        nav_a = sort.find_all("a")
        obj.setSort(nav_a[-1].string)
    # Article editor
    editor = soup.find(id="editor_baidu")
    if editor is not None:
        obj.setEditor(editor.find("strong").string)
    return obj
def printData(obj):
    print("Name:%10s Author:%10s Sort:%10s Editor:%10s Time:%10s" % (
        obj.getEssayName(), obj.getEssayAuthor(), obj.getSort(),
        obj.getEditor(), obj.getReleaseTime()))
def main():
    dataList = []
    # Match only article pages: /html/<category>/<six digits>.htm, excluding zixun
    p = re.compile(r"http://www.ithome.com/html/((?!zixun).)*/\d{6}.htm")
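    # Sanity check of the pattern against hypothetical sample URLs:
    #   p.match("http://www.ithome.com/html/it/123456.htm")    -> matches
    #   p.match("http://www.ithome.com/html/zixun/123456.htm") -> None, because
    #   the (?!zixun) lookahead rejects category segments starting with "zixun"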
    # Seed the URL list with matching links from the front page
    s = getCurrentPageUrl(getHtmlText("http://www.ithome.com/"), p)
    urlList = list(s)
    # If the front page yields fewer than 500 links, crawl each collected
    # page for further matching links until 500+ have been gathered
    if len(s) < 500:
        for i in range(0, len(s)):
            res = getCurrentPageUrl(getHtmlText(urlList[i]), p)
            # Append only unseen links so no page is crawled twice
            urlList += [u for u in res if u not in urlList]
            if len(urlList) > 500:
                break
    for url in urlList:
        obj = getCurrentPageData(getHtmlText(url))
        printData(obj)
        dataList.append(obj)

if __name__ == "__main__":
    main()
A simple focused (directed) web crawler.