import requests
from bs4 import BeautifulSoup
import re
# Focused crawler: collect 500+ news articles from ithome.com (IT之家)
# The site's robots.txt is as follows:
#User-Agent: Baiduspider
#Disallow: /tags/
#Disallow: /tag/
#Disallow: /ithome/
#Disallow: /keywords/
#Disallow: /search/
#Disallow: /tag/adt_all*
#Disallow: /comment/
#Disallow: /*?*
#Disallow: /?*
#Disallow: /html/zixun/
#User-Agent: *
#Disallow: /ithome/
#Disallow: /keywords/
#Disallow: /search/
#Disallow: /comment/
#Disallow: /*?*
#Disallow: /?*
#Disallow: /html/zixun/
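# Note how the crawler below stays within these rules: the URL pattern used in
# main() skips /html/zixun/ via a (?!zixun) lookahead, and the disallowed
# /tag/, /search/ and /comment/ paths can never match the article-link pattern.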
# Define an article class
class Essay:
    # Article title
    __essayName = ""
    # Article author
    __essayAuthor = ""
    # Article category
    __sort = ""
    # Article editor
    __editor = ""
    # Publication time
    __releaseTime = ""

    def setEssayName(self, name):
        self.__essayName = name

    def getEssayName(self):
        return self.__essayName

    def setEssayAuthor(self, author):
        self.__essayAuthor = author

    def getEssayAuthor(self):
        return self.__essayAuthor

    def setSort(self, sort):
        self.__sort = sort

    def getSort(self):
        return self.__sort

    def setEditor(self, editor):
        self.__editor = editor

    def getEditor(self):
        return self.__editor

    def setReleaseTime(self, releaseTime):
        self.__releaseTime = releaseTime

    def getReleaseTime(self):
        return self.__releaseTime
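# Side note: the getter/setter style above mirrors Java. An equivalent record in
# more idiomatic Python would be a dataclass; a minimal sketch (EssayRecord is
# illustrative only and is not used by the crawler below):
from dataclasses import dataclass

@dataclass
class EssayRecord:
    essayName: str = ""
    essayAuthor: str = ""
    sort: str = ""
    editor: str = ""
    releaseTime: str = ""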
# Fetch the content of a URL
def getHtmlText(url):
    try:
        kv = {"user-agent": "Chrome/10.0"}
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'  # inspecting the site confirms the pages are utf-8
        return r.text
    except Exception:
        return "error"
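# Example: getHtmlText("http://www.ithome.com/") returns the front page's HTML,
# or the string "error" if the request fails or times out.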
# Collect every matching article URL on the current page
# (pattern: http://www.ithome.com/html/((?!zixun).)*/\d{6}.htm)
def getCurrentPageUrl(text, p):
    urls = set()
    soup = BeautifulSoup(text, "html.parser")
    for a in soup.find_all(href=p):
        urls.add(a.get('href'))
    return urls
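# find_all(href=p) works because BeautifulSoup accepts a compiled regular
# expression as an attribute filter: it keeps only tags whose href matches p.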
# Extract the article metadata from the current page
def getCurrentPageData(text):
    soup = BeautifulSoup(text, "html.parser")
    obj = Essay()
    # Publication time
    pub_time = soup.find(id="pubtime_baidu")
    if pub_time is not None:
        obj.setReleaseTime(pub_time.string)
    # Article title
    name = soup.find(attrs={"class": "post_title"})
    if name is not None:
        obj.setEssayName(name.find("h1").string)
    # Article author
    author = soup.find(id="author_baidu")
    if author is not None:
        obj.setEssayAuthor(author.find("strong").string)
    # Article category: the last link in the breadcrumb navigation
    sort = soup.find("div", attrs={"class": "current_nav"})
    if sort is not None:
        nav_a = sort.find_all("a")
        obj.setSort(nav_a[-1].string)
    # Article editor
    editor = soup.find(id="editor_baidu")
    if editor is not None:
        obj.setEditor(editor.find("strong").string)
    return obj
def printData(obj):
    print("Name:%10s Author:%10s Sort:%10s Editor:%10s Time:%10s" % (
        obj.getEssayName(), obj.getEssayAuthor(), obj.getSort(),
        obj.getEditor(), obj.getReleaseTime()))
def main():
    dataList = []
    # Match only article pages: /html/<category>/<six digits>.htm, excluding zixun
    p = re.compile(r"http://www.ithome.com/html/((?!zixun).)*/\d{6}.htm")
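    # Sanity check of the pattern against hypothetical sample URLs:
    #   p.match("http://www.ithome.com/html/it/123456.htm")    -> matches
    #   p.match("http://www.ithome.com/html/zixun/123456.htm") -> None, because
    #   the (?!zixun) lookahead rejects category segments starting with "zixun"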
    # Seed the URL list with matching links from the front page
    s = getCurrentPageUrl(getHtmlText("http://www.ithome.com/"), p)
    urlList = list(s)
    # If the front page yields fewer than 500 links, crawl each collected
    # page for further matching links until 500+ have been gathered
    if len(s) < 500:
        for i in range(0, len(s)):
            res = getCurrentPageUrl(getHtmlText(urlList[i]), p)
            # Append only unseen links so no page is crawled twice
            urlList += [u for u in res if u not in urlList]
            if len(urlList) > 500:
                break
    for url in urlList:
        obj = getCurrentPageData(getHtmlText(url))
        printData(obj)
        dataList.append(obj)

if __name__ == "__main__":
    main()
A simple focused (directed) web crawler.