Importing Data into HBase with the Java API

This post uses HBase to store a celebrity dataset made up of text about each celebrity plus their photos.
The text is scraped from Chinese Wikipedia with the Scrapy framework and saved in CSV format.
The images are scraped from Baidu Images, up to 30 per person, into a folder named after the celebrity.
This post therefore covers three parts:
- the spider that scrapes the text
- the spider that scrapes the images
- importing the data into HBase


Scraping Wikipedia with Scrapy

First, create a new Scrapy project and define the item fields in items.py.
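items.py only needs the three fields the spider below actually populates (name, title, info); a minimal sketch:

<python>
# items.py -- reconstructed sketch; only the field names are taken from the spider
from scrapy import Item, Field

class CelebrityItem(Item):
    name = Field()   # celebrity name (later used as the HBase rowkey)
    title = Field()  # Wikipedia section title
    info = Field()   # section text
</python>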
Then add the following to settings.py:

<python>
FEED_URI = u'file:///F:/pySpace/celebrity/info1.csv'
FEED_FORMAT = 'csv'   # Scrapy's feed exporters are keyed by lowercase format names
</python>

This saves the scraped items as CSV at the given path; each row carries the name, title, and info columns that the Java import below reads back by header.

Add the following to main.py so the spider can be started as an ordinary script:

<python>
# main.py -- Python 2 only (reload/setdefaultencoding)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # so Chinese names survive the byte/unicode juggling below

from scrapy import cmdline
cmdline.execute("scrapy crawl celebrity".split())
</python>
The spider itself:

<python>
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from celebrity.items import CelebrityItem
from scrapy.http import Request

# Read the list of celebrity names to crawl
with open(r'F:\pySpace\celebrity\name_lists1.txt', 'r') as f:
    url_list = f.read()
url_list = url_list.split('\n')

class Celebrity(CrawlSpider):
    len_url = len(url_list)
    num = 1                 # index into url_list of the name being crawled
    name = "celebrity"
    front_url = 'https://zh.wikipedia.org/wiki/'
    # .encode('utf-8') on byte strings relies on the setdefaultencoding hack in main.py
    start_urls = [front_url + url_list[num].encode('utf-8')]

    def parse(self, response):
        item = CelebrityItem()
        selector = Selector(response)
        body = selector.xpath('//*[@id="mw-content-text"]')[0]
        Title = body.xpath('//span[@class="mw-headline"]/text()').extract()
        titles = ['简介']  # the leading paragraphs get an "Introduction" pseudo-title
        # Drop the reference/footnote/external-link sections
        for t in Title:
            if t not in ('参考文献', '注释', '外部链接', '参考资料'):
                titles.append(t)
        Passage = selector.xpath('//*[@id="mw-content-text"]/p')
        all_info = []
        for eachPassage in Passage:
            info = ''.join(eachPassage.xpath('.//text()').extract())
            if info != '':
                all_info.append(info.strip())
        Ul_list = selector.xpath('//*[@id="mw-content-text"]/ul')
        for eachul in Ul_list:
            info = ''.join(eachul.xpath('.//text()').extract())
            if info != '' and info != '\n' and info != ' ':
                all_info.append(info)

        # Distribute the scraped paragraphs evenly across the section titles
        k = 0
        epoch = len(all_info) / len(titles)  # integer division (Python 2)
        if epoch > 0:
            for i in range(len(titles)):
                if i == len(titles) - 1:
                    item['name'] = url_list[self.num].encode('utf-8')
                    item['title'] = titles[i]
                    item['info'] = ''.join(all_info[k:])
                else:
                    item['name'] = url_list[self.num].encode('utf-8')
                    item['title'] = titles[i]
                    item['info'] = ''.join(all_info[k:k + epoch])
                    k = k + epoch
                yield item
        else:
            # Fewer paragraphs than titles: one paragraph per title
            for j in range(len(all_info)):
                item['name'] = url_list[self.num].encode('utf-8')
                item['title'] = titles[j]
                item['info'] = all_info[j]
                yield item


        # Variant without section titles (kept for reference):
        # for j in range(len(all_info)):
        #     item['name'] = url_list[self.num].encode('utf-8')
        #     item['info'] = all_info[j]
        #     yield item

        print item['name']
        self.num = self.num + 1
        print self.num
        # Queue the next celebrity's page
        if self.num < self.len_url:
            nextUrl = self.front_url + url_list[self.num].encode('utf-8')
            yield Request(nextUrl, callback=self.parse)
</python>

Scraping the images

The image spider below downloads up to 30 images per name from Baidu Images and also records each image's source URL (Python 2 again):

<python>
import urllib2
import re
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

def img_spider(name_file):

    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
    headers = {'User-Agent':user_agent}

    with open(name_file) as f:
        name_list = [name.rstrip().decode('utf-8') for name in f.readlines()]

    for name in name_list:
        # One folder per celebrity; skip names that were already downloaded
        if not os.path.exists('F:/pySpace/celebrity/img_data/' + name):
            os.makedirs('F:/pySpace/celebrity/img_data/' + name)
            try:
                url = "http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=" + name.replace(' ', '%20') + "&cg=girl&rn=60&pn=60"
                req = urllib2.Request(url, headers=headers)
                res = urllib2.urlopen(req)
                page = res.read()
                # The image URLs are embedded in the response as "objURL" fields
                img_srcs = re.findall('"objURL":"(.*?)"', page, re.S)
                print name, len(img_srcs)
            except:
                print name, " error:"
                continue
            j = 1          # image counter (file names 1.jpg ... 30.jpg)
            src_txt = ''   # accumulates the source URLs

            for src in img_srcs:
                with open('F:/pySpace/celebrity/img_data/' + name + '/' + str(j) + '.jpg', 'wb') as p:
                    try:
                        print "downloading No.%d" % j
                        req = urllib2.Request(src, headers=headers)
                        img = urllib2.urlopen(req, timeout=3)  # open the Request so the User-Agent header is sent
                        p.write(img.read())
                    except:
                        print "No.%d error:" % j
                        continue
                src_txt = src_txt + src + '\n'
                if j == 30:  # keep at most 30 images per person
                    break
                j = j + 1
            # Save the collected source URLs next to the images
            with open('F:/pySpace/celebrity/img_data/' + name + '/' + name + '.txt', 'wb') as p2:
                p2.write(src_txt)
            print "save %s txt done" % name


if __name__ == '__main__':
    name_file = "name_lists1.txt"
    img_spider(name_file)
</python>

Importing the data into HBase with the Java API

Create two tables in HBase: celebrity (stores the image data) and celebrity_info (stores the text data); the celebrity's name is the rowkey in both. A single column family cf1 holds everything, with the section title (text table) or the image file name (image table) as the column qualifier. The Java code follows; a quick Python read-back sketch appears after it.

<java>
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import com.csvreader.CsvReader;
import java.nio.charset.Charset;
import java.io.*;

/**
 * Created by mxy on 2016/10/31.
 */
public class CelebrityDataBase {

    /* Create a table with a single column family cf1 */
    public void createTable(String tablename) throws Exception {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "node4,node5,node6");
        HBaseAdmin admin = new HBaseAdmin(config);

        // Drop an existing table first so the import starts clean
        if (admin.isTableAvailable(tablename)) {
            admin.disableTable(tablename);
            admin.deleteTable(tablename);
        }
        HTableDescriptor t = new HTableDescriptor(tablename.getBytes());
        HColumnDescriptor cf1 = new HColumnDescriptor("cf1".getBytes());
        cf1.setMaxVersions(10);
        t.addFamily(cf1);
        admin.createTable(t);
        admin.close();
    }
    // Insert the text data from the CSV file
    public void putInfo() throws Exception {
        CsvReader r = new CsvReader("F://pySpace//celebrity//info.csv", ',', Charset.forName("utf-8"));
        r.readHeaders();
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "node4,node5,node6");
        HTable table = new HTable(config, "celebrity_info");
        while (r.readRecord()) {
            System.out.println(r.get("name"));
            // rowkey = name, qualifier = section title, value = section text
            Put put = new Put(r.get("name").getBytes());
            put.add("cf1".getBytes(), r.get("title").getBytes(), r.get("info").getBytes());
            table.put(put);
        }
        r.close();
        table.close();
    }

    // Fetch one image cell and write it back to disk
    public void getImage(String celebrity_name, String img_num) throws Exception {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "node4,node5,node6");
        HTable table = new HTable(config, "celebrity");
        Get get = new Get(celebrity_name.getBytes());
        Result res = table.get(get);
        Cell c1 = res.getColumnLatestCell("cf1".getBytes(), img_num.getBytes());
        File file = new File("D://" + celebrity_name + img_num); // where the retrieved image bytes are written
        FileOutputStream fos = new FileOutputStream(file);
        fos.write(CellUtil.cloneValue(c1)); // Cell has no getValue(); clone the value bytes instead
        fos.flush();
        System.out.println(file.length());
        fos.close();
        table.close();
    }

    // Fetch and print every text cell for one celebrity
    public void getInfo(String name) throws Exception {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "node4,node5,node6");
        HTable table = new HTable(config, "celebrity_info");

        Get get = new Get(name.getBytes());
        Result result = table.get(get);
        for (Cell cell : result.rawCells()) {
            System.out.println("rowKey:" + new String(CellUtil.cloneRow(cell))
                    + " cfName:" + new String(CellUtil.cloneFamily(cell))
                    + " qualifierName:" + new String(CellUtil.cloneQualifier(cell))
                    + " value:" + new String(CellUtil.cloneValue(cell)));
        }
        table.close();
    }

    // Insert one image: rowkey = name, qualifier = image file name, value = raw bytes
    public void putImage(String each_celebrity, String each_img) throws Exception {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "node4,node5,node6");
        HTable table = new HTable(config, "celebrity");
        String str = String.format("F://pySpace//celebrity//img_data//%s//%s", each_celebrity, each_img);
        File file = new File(str);
        int size = (int) file.length();
        System.out.println(size);
        byte[] bbb = new byte[size];
        InputStream a = new FileInputStream(file);
        try {
            a.read(bbb); // read the image into memory (a single read is assumed sufficient here)
        } finally {
            a.close();
        }
        Put put = new Put(each_celebrity.getBytes());
        put.add("cf1".getBytes(), each_img.getBytes(), bbb);
        table.put(put);
        table.close();
    }

    public static void main(String[] args) {
        CelebrityDataBase pt = new CelebrityDataBase();
        try {
            pt.createTable("celebrity");
            pt.createTable("celebrity_info");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("createTable error");
        }
        String root_path = "F://pySpace//celebrity//img_data";
        File[] files = new File(root_path).listFiles();

        // Walk each celebrity's folder and insert every image
        for (int i = 0; i < files.length; i++) {
            String each_path = root_path + "//" + files[i].getName();
            File[] celebrity_files = new File(each_path).listFiles();
            System.out.println(each_path);
            for (int j = 0; j < celebrity_files.length; j++) {
                // Skip the <name>.txt URL list that sits next to the images
                if (!celebrity_files[j].getName().endsWith(".jpg")) {
                    continue;
                }
                try {
                    pt.putImage(files[i].getName(), celebrity_files[j].getName());
                } catch (Exception e) {
                    e.printStackTrace();
                    System.out.println("putImage error");
                }
            }
        }
        // Insert the text data
        try {
            pt.putInfo();
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Read one image back
        try {
            pt.getImage("龔照勝", "13.jpg");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("getImage error");
        }
        // Read the text back
        try {
            pt.getInfo("成龙");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

</java>
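To sanity-check the import outside Java, the rows can also be read back over Thrift. This is only a sketch: it assumes an HBase Thrift server is running on node4 and the happybase package is installed, neither of which is part of the setup above.

<python>
# -*- coding: utf-8 -*-
# Read-back check via happybase (assumption: Thrift server on node4, default port)
import happybase

conn = happybase.Connection('node4')
info = conn.table('celebrity_info')
# row() returns a dict mapping 'cf1:<section title>' -> section text
for qualifier, value in info.row(u'成龙'.encode('utf-8')).items():
    print qualifier, value[:60]
</python>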