Homework

兄弟连 (itxdl.cn) teacher crawler

import scrapy
from py10.items import TeacherItem  # import the item model
class XdlSpider(scrapy.Spider):
    name = 'xdl2'
    allowed_domains = ['itxdl.cn']
    start_urls = ['http://www.itxdl.cn/activity/teacher/teacher_lieibiao/']
    def parse(self, response):
        # grab every teacher div
        # xpath version:
        # teacher_list = response.xpath('//div[@class="php_jiangshi_liebiao"]')
        # css version:
        teacher_list = response.css('div.php_jiangshi_liebiao')

        for teacher in teacher_list:
            item = TeacherItem()
            # teacher name, industry and avatar
            # xpath version:
            # name = teacher.xpath('.//h1/text()').extract()[0]  # extract() returns a list
            # industry = teacher.xpath('.//p/text()').extract()[0]
            # image = teacher.xpath('.//img/@src').extract()[0]
            # css version:
            name = teacher.css('h1::text').extract()[0]
            industry = teacher.css('p::text').extract()[0]
            image = teacher.css('img::attr(src)').extract()[0]

            # populate the item
            item['name'] = name
            item['industry'] = industry
            item['image'] = image
            # hand the item over to the item pipelines
            yield item
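
The TeacherItem model imported above is not shown in these notes; a minimal items.py sketch that matches the fields used here (the extra spider field is needed because TeacherPipeline below writes item['spider']) could look like this:

import scrapy

class TeacherItem(scrapy.Item):
    name = scrapy.Field()      # teacher name
    industry = scrapy.Field()  # industry / title line
    image = scrapy.Field()     # avatar image URL
    spider = scrapy.Field()    # filled in by TeacherPipeline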

pipeline
import json
class Py10Pipeline(object):
    # the method every item pipeline has to implement
    def process_item(self, item, spider):
        return item
class TeacherPipeline(object):
    def __init__(self):
        self.f = open('teacher.json', 'w', encoding='utf-8')
    def process_item(self, item, spider):
        # write the item to the JSON lines file
        item['spider'] = spider.name  # record which spider produced the item
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        # return the item so the remaining pipelines can keep processing it
        return item
    # called once when the spider finishes
    def close_spider(self, spider):
        self.f.close()
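
For Py10Pipeline and TeacherPipeline to actually receive items, they have to be enabled in the project's settings.py; a minimal sketch for the py10 project (module paths inferred from the imports above, the priority numbers are only examples):

# settings.py -- lower number = runs earlier
ITEM_PIPELINES = {
    'py10.pipelines.Py10Pipeline': 300,
    'py10.pipelines.TeacherPipeline': 400,
}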

Tencent Recruitment (腾讯招聘) crawler

import scrapy
from day12.items import TencentItem  # assuming the project module is day12, as in the pipeline path below

class TencentSpider(scrapy.Spider):
    name = 'tencent'  # spider name
    allowed_domains = ['tencent.com']  # allowed domains
    start_urls = ['http://hr.tencent.com']
    base_url = 'http://hr.tencent.com/position.php?&start=%d'

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # settings overrides that apply only to this spider
    custom_settings = {
        'ITEM_PIPELINES' : {
            'day12.pipelines.TencentPipeline': 1,
        },
        'DOWNLOAD_DELAY' : 0,
        'CONCURRENT_REQUESTS' : 32,
    }

    def parse(self, response):
        # build a request for every listing page (start increases in steps of 10)
        for i in range(0,2910 + 1,10):
            fullurl = self.base_url % i
            yield scrapy.Request(url=fullurl,callback=self.parseList,headers=self.headers)

    # parse a listing page
    def parseList(self,response):
        detail_urls = response.css('tr.even a::attr(href),tr.odd a::attr(href)').extract()
        for url in detail_urls:
            fullurl = 'http://hr.tencent.com/' + url
            yield scrapy.Request(url=fullurl,callback=self.parseDetail,headers=self.headers)

    # parse a detail page
    def parseDetail(self,response):
        item = TencentItem()

        title = response.xpath('//td[@id="sharetitle"]/text()').extract()[0]
        info = response.xpath('//table//tr[2]/td/text()').extract()
        location = info[0]
        p_type = info[1]
        number = info[2].strip('人')  # drop the trailing "人" (people) unit
        duty = response.xpath('//table//tr[3]//li/text()').extract()
        duty = ''.join(duty)


        requirement = response.xpath('//table//tr[4]//li/text()').extract()
        requirement = ''.join(requirement)
        item["title"] = title
        item["location"] = location
        item["p_type"] = p_type
        item["number"] = number
        item["duty"] = duty
        item["requirement"] = requirement
        # hand the item over to the pipelines
        yield item
class TencentItem(scrapy.Item):
    title = scrapy.Field()
    location = scrapy.Field()
    p_type = scrapy.Field()
    number = scrapy.Field()
    duty = scrapy.Field()
    requirement = scrapy.Field()

import json

class TencentPipeline(object):
    def __init__(self):
        self.f = open('position.json', 'w', encoding='utf-8')
    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
    def close_spider(self, spider):
        self.f.close()
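
Because TencentPipeline is already registered through the spider's custom_settings, no global settings change is needed; the spider can be started with a small run script in the same style as the ones further down (filename assumed):

# run_tencent.py -- start the spider from a script instead of the shell
from scrapy.cmdline import execute
execute('scrapy crawl tencent'.split())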

Other spiders

import scrapy

class TaobaoItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    sales = scrapy.Field()
    img_url = scrapy.Field()

import os
import requests

class StoreImagePipeline(object):
    # download the product image for each Taobao item
    def process_item(self, item, spider):
        img_url = item['img_url']
        img_url = 'http:' + img_url  # pic_url is protocol-relative (starts with //)
        response = requests.get(img_url)
        if not os.path.exists('download'):
            os.mkdir('download')
        filename = 'download/' + img_url.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(response.content)
        return item

class ImgsrcPipeline(object):
    # download each Tuchong image to the tuchong/ directory
    def process_item(self, item, spider):
        img_url = item['src']
        response = requests.get(img_url)
        if not os.path.exists('tuchong'):
            os.mkdir('tuchong')
        filename = 'tuchong/' + img_url.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(response.content)
        return item
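
Both pipelines above fetch images synchronously with requests, which blocks Scrapy's event loop while each download runs. Scrapy ships a non-blocking alternative, scrapy.pipelines.images.ImagesPipeline (it needs Pillow installed); a rough sketch of enabling it, assuming the item carries the URLs in an image_urls list field:

# settings.py -- sketch using Scrapy's built-in images pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'download'  # directory where the images are stored

# in the spider, the item would then be filled like:
# item['image_urls'] = ['http:' + img_url]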
# run scripts (each one is its own small file that starts one spider)
from scrapy.cmdline import execute
execute('scrapy crawl bj58'.split())

from scrapy.cmdline import execute
execute('scrapy crawl qiushibaike'.split())

from scrapy.cmdline import execute
execute('scrapy crawl taobao'.split())

from scrapy.cmdline import execute
execute('scrapy crawl tuchong'.split())
import scrapy

class Bj58Spider(scrapy.Spider):
    name = 'bj58'
    allowed_domains = ['bj.58.com']
    start_urls = ['http://bj.58.com/ershouche/?PGTID=0d100000-0000-1ce3-e602-1acefd5f07af&ClickID=4']

    def parse(self, response):

        # with open('58.html','wb') as f:
        #     f.write(response.body)
        li_list = response.xpath('//ul[@class="car_list ac_container"]/li')
        # print(li_list)
        for li_ele in li_list:
            title = li_ele.xpath('./div[@class="col col2"]/a/h1//text()').extract()
            title = ''.join(title).strip()
            # print(title)
            price = li_ele.xpath('./div[@class="col col3"]/h3/text()').extract_first()
            # print(price)
            param = li_ele.xpath('./div[@class="col col2"]/div[@class="info_param"]/span/text()').extract()
            param = ''.join(param)
            # print(param)
            tags = li_ele.xpath('./div[@class="col col2"]/div[@class="info_tags"]/div//text()').extract()
            tags = ''.join(tags).strip()
            print('=========='*30)
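
The parse method above only prints a separator line after extracting the fields; to persist the records, each one could be yielded as an item (Scrapy also accepts plain dicts), roughly like this inside the for loop:

# sketch: yield each parsed 58.com listing instead of just printing
yield {
    'title': title,
    'price': price,
    'param': param,
    'tags': tags,
}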
import scrapy


class QiushibaikeSpider(scrapy.Spider):
    name = 'qiushibaike'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/']

    def parse(self, response):
        # with open('qiushi.html','wb') as f:
        #     f.write(response.body)
        authors = response.xpath('//div[@class="author clearfix"]/a[2]/h2/text()').extract()
        print(len(authors))

        for i in range(0,len(authors)):
            author = authors[i]
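
The loop above stops at pulling out each author name; a sketch of how the joke text could be collected alongside it and yielded (the content XPath is an assumption, not verified against the page):

# sketch: pair every author with the matching joke text and yield a dict
contents = response.xpath('//div[@class="content"]/span/text()').extract()
for i in range(0, len(authors)):
    yield {
        'author': authors[i].strip(),
        'content': contents[i].strip() if i < len(contents) else '',
    }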

# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy_project.items import TaobaoItem
class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['taobao.com']
    start_urls = ['https://s.taobao.com/list?spm=a219r.lm5734.0.0.478ae18amAPaDS&q=%E6%96%87%E8%83%B8%E5%A5%97%E8%A3%85&abver=old&input_query=%E6%96%87%E8%83%B8&suggest_offset=0&from=suggest&cat=1625&seller_type=taobao&style=list']

    def parse(self, response):
        # with open('taobao.html','wb') as f:
        #     f.write(response.body)
        base_url = 'https://s.taobao.com/list?spm=a219r.lm5734.0.0.478ae18amAPaDS&q=%E6%96%87%E8%83%B8%E5%A5%97%E8%A3%85&abver=old&input_query=%E6%96%87%E8%83%B8&suggest_offset=0&from=suggest&cat=1625&seller_type=taobao&style=list&bcoffset=0&s={}'
        # s = 0
        for i in range(1,11):
            s = (i-1)*60
            url = base_url.format(s)
            # s += 60
            # print(url)
            yield scrapy.Request(url,callback=self.parse_detail)
    def parse_detail(self, response):
        html = response.text
        titles = re.findall('"raw_title":"(.*?)"', html)
        # print(len(titles))
        prices = re.findall('"view_price":"(.*?)"', html)
        # print(len(prices))
        sales = re.findall('"view_sales":"(.*?)"', html)
        # print(len(sales))
        img_urls = re.findall('"pic_url":"(.*?)"', html)
        # print(len(img_urls))
        for i in range(0, len(titles)):
            item = TaobaoItem()
            item['title'] = titles[i]
            item['price'] = prices[i]
            item['sales'] = sales[i]
            item['img_url'] = img_urls[i]
            yield item
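
StoreImagePipeline reads item['img_url'] while ImgsrcPipeline reads item['src'], so enabling both pipelines globally would raise a KeyError for one spider or the other; a sketch that reuses the per-spider custom_settings pattern from TencentSpider above (priorities are only examples):

# in TaobaoSpider
custom_settings = {
    'ITEM_PIPELINES': {'scrapy_project.pipelines.StoreImagePipeline': 300},
}

# in TuchongSpider
custom_settings = {
    'ITEM_PIPELINES': {'scrapy_project.pipelines.ImgsrcPipeline': 300},
}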
import scrapy
import json
from scrapy_project.items import TuchongItem
class TuchongSpider(scrapy.Spider):
    name = 'tuchong'
    allowed_domains = ['tuchong.com']
    start_urls = ['https://tuchong.com/rest/tags/%E7%BE%8E%E5%A5%B3/posts?page=1&count=20&order=weekly']

    def parse(self, response):
        res_dict = json.loads(response.text)
        for res in res_dict['postList']:
            url = res['url']
            # title = res['title']
            # excerpt = res['excerpt']
            # print(url)

            yield scrapy.Request(url,callback=self.get_detail)
    def get_detail(self,response):
        # with open('tuchong.html','wb') as f:
        #     f.write(response.body)
        srcs = response.xpath('//article[@class="post-content"]/img/@src').extract()
        for i in range(0,len(srcs)):
            item = TuchongItem()
            item['src'] = srcs[i]
            # print(srcs[i])

            yield item
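
TuchongItem is imported from scrapy_project.items but not shown in these notes; a minimal sketch consistent with the single src field used above:

import scrapy

class TuchongItem(scrapy.Item):
    src = scrapy.Field()  # image URL pulled from the post page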