Crawler Model Homework: Code Notes

import requests
from lxml import etree

class Content:
    """
    Common base class for all articles/pages
    """
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """
        Flexible printing function controls output
        """
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

Elements are located with XPath.
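As a minimal sketch of this kind of XPath locating (the HTML fragment is made up for illustration; the id matches the list selector used for the first site below):

from lxml import etree

html = '<div id="wp_news_w6"><a href="info/news1.htm">News 1</a></div>'
selector = etree.HTML(html)

# The same style of expression the Website objects below use to find links
links = selector.xpath('//*[@id="wp_news_w6"]//a')
print(links[0].attrib['href'])  # -> info/news1.htm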

class Website:
    """ 
    Contains information about website structure
    """

    def __init__(self, name, url, x_path, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.x_path = x_path
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        

selectedElems = pageObj.xpath('string({})'.format(x_path))
string() is used here because the text of a content page is spread across the child tags of one parent tag, so string() collects it all at once. Note that the path inside string() does not need extra quotation marks.
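A quick illustration of the difference, on a made-up fragment: string() flattens the text of the parent tag and all of its children into a single string, while a plain path only returns the matched elements.

from lxml import etree

html = '<div id="entry"><p>First paragraph.</p><p>Second paragraph.</p></div>'
selector = etree.HTML(html)

# string() concatenates the text of the element and all of its descendants
body = selector.xpath('string(//*[@id="entry"])')
print(body)   # -> First paragraph.Second paragraph.

# a plain path only returns the matched elements, not their combined text
elems = selector.xpath('//*[@id="entry"]')
print(elems)  # -> [<Element div at 0x...>]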

class Crawler:
    def __init__(self, site):
        self.site = site
    def getPage(self, url):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
        headers = {"User-Agent": user_agent}  # request headers; a dict of header fields

        try:
            html = requests.get(url, headers=headers).content
        except requests.exceptions.RequestException:
            # return None so parse() can skip pages that fail to load
            return None
        selector = etree.HTML(html)
        return selector
    
    def get_url(self, selector, x_path):
        """
        Collect the href attribute of every link matched by x_path and
        join it with the site's base URL.
        """
        labels = [i.attrib for i in selector.xpath(x_path)]
        urls = []
        for j in labels:
            try:
                urls.append(self.site.url + j['href'])
            except KeyError:
                # skip <a> tags that have no href attribute
                continue
        return urls
    
    def safeGet(self, pageObj, x_path):
        """
        Utility function that extracts a content string from an lxml
        element via an XPath expression. Returns an empty string if the
        expression cannot be evaluated.
        """
        try:
            selectedElems = pageObj.xpath('string({})'.format(x_path))
        except etree.XPathError:
            return ''

        return selectedElems

    def parse(self, url):
        """
        Extract content from a given page URL
        """
        selector = self.getPage(url)
        if selector is not None:
            title = self.safeGet(selector, self.site.titleTag)
            body = self.safeGet(selector, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        selector = self.getPage(self.site.url)
        urls = self.get_url(selector,self.site.x_path)
        for url in urls:
            self.parse(url)
                        

                

siteData = [
    ['kjxy', 'http://kjxy.hbue.edu.cn/', '//*[(@id = "wp_news_w6")]//a', '/html/body/div[2]/div[1]/div[3]/div[2]/div/div[1]/div/div/h1','/html/body/div[2]/div[1]/div[3]/div[2]/div/div[1]/div/div/div[5]/div/div'],
    ['jrxy', 'http://jrxy.hbue.edu.cn/', '//*[(@id = "wp_news_w3")]//a','//h1', '//*[@id="entry"]/div/div'],
    ['yjs', 'http://yjs.hbue.edu.cn/', '//*[(@id = "wp_news_w3")]//a', '//h1','//*[@id="mainbody3"]/div[2]/div/div[3]/div/div[1]'],
    ['mpacc', 'http://mpacc.hbue.edu.cn/', '//*[(@id = "wp_news_w4")]//a', '/html/body/div[1]/div[5]/div[2]/div/div/div[1]','//*[@id="entry"]/div/div']
]
reuter1 = Website(*siteData[0])
reuter2 = Website(*siteData[1])
reuter3 = Website(*siteData[2])
reuter4 = Website(*siteData[3])


crawler = Crawler(reuter1)
crawler.crawl()
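
The demo above only crawls the first site; a minimal sketch for running all four configured Website objects in turn, assuming the same Crawler class:

for site in (reuter1, reuter2, reuter3, reuter4):
    Crawler(site).crawl()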