#Download the file hbase-examples-1.2.0-cdh5.10.0.jar into /home/cloudera/:
wget https://repository.cloudera.com/content/repositories/releases/org/apache/hbase/hbase-examples/1.2.0-cdh5.10.0/hbase-examples-1.2.0-cdh5.10.0.jar

------------ example 1 ------------
#Run in the Linux shell
export SPARK_HIVE=true
pyspark --master local --driver-class-path "/usr/lib/hbase/*:/usr/lib/hbase/lib/*:/home/cloudera/hbase-examples-1.2.0-cdh5.10.0.jar:/usr/lib/spark/lib/spark-examples-1.6.0-cdh5.12.0-hadoop2.6.0-cdh5.12.0.jar"

#------
#Code in the pyspark shell
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
hTbl = sqlContext.read.format('org.apache.hadoop.hbase.spark') \
    .option('hbase.table', 't') \
    .option('hbase.columns.mapping', 'KEY_FIELD STRING :key, A STRING c:a, B STRING c:b') \
    .option('hbase.use.hbase.context', False) \
    .option('hbase.config.resources', 'file:///etc/hbase/conf/hbase-site.xml') \
    .load()
hTbl.show()

#Run a SQL query against the HBase-backed DataFrame
sqlContext.registerDataFrameAsTable(hTbl, "t")
df2 = sqlContext.sql("SELECT * FROM t")
df2.show()

------------ example 2 ------------
#Create the table in HBase and load data.
#In the HBase shell run:
create 'people', 'cf1'
put 'people', 'userid1', 'cf1:fname', 'John'
put 'people', 'userid1', 'cf1:lname', 'Doe'
put 'people', 'userid1', 'cf1:age', '41'
put 'people', 'userid2', 'cf1:fname', 'Jeffrey'
put 'people', 'userid2', 'cf1:lname', 'Aven'
put 'people', 'userid2', 'cf1:age', '46'
put 'people', 'userid2', 'cf1:city', 'Hayward'

#Start pyspark with the following parameters
pyspark --master local --driver-class-path "/usr/lib/hbase/*:/usr/lib/hbase/lib/*:../spark-examples.jar:/home/cloudera/hbase-examples-1.2.0-cdh5.10.0.jar:/usr/lib/spark/lib/spark-examples-1.6.0-cdh5.12.0-hadoop2.6.0-cdh5.12.0.jar"

#Read the data from the HBase table
conf = {"hbase.zookeeper.quorum": "localhost", "hbase.mapreduce.inputtable": "people"}
keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                               "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                               "org.apache.hadoop.hbase.client.Result",
                               keyConverter=keyConv, valueConverter=valueConv, conf=conf)
hbase_rdd.collect()

#Create a user RDD and save its contents to the HBase table people
conf2 = {"hbase.zookeeper.quorum": "localhost",
         "hbase.mapred.outputtable": "people",
         "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
         "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
         "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
keyConv2 = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
valueConv2 = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
newpeople = sc.parallelize([('userid3', ['userid3', 'cf1', 'fname', 'NewUser'])])
newpeople.saveAsNewAPIHadoopDataset(conf=conf2, keyConverter=keyConv2, valueConverter=valueConv2)

#Verify in HBase that the value for userid3 was inserted
#Run in the HBase shell:
scan 'people'
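
#------
#StringListToPutConverter turns each value list [row, column family, qualifier, value] into one Put,
#so a complete row can be written by emitting one (key, list) pair per column.
#A minimal sketch reusing conf2, keyConv2 and valueConv2 from above; the row key userid4 and its
#column values are illustrative, not part of the original example.
fullrow = sc.parallelize([
    ('userid4', ['userid4', 'cf1', 'fname', 'Jane']),
    ('userid4', ['userid4', 'cf1', 'lname', 'Roe']),
    ('userid4', ['userid4', 'cf1', 'age', '33'])
])
fullrow.saveAsNewAPIHadoopDataset(conf=conf2, keyConverter=keyConv2, valueConverter=valueConv2)
#Running scan 'people' in the HBase shell should now also show fname, lname and age for userid4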
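
#------
#After these inserts the table can be read again with the same newAPIHadoopRDD call shown above.
#Depending on the spark-examples version, HBaseResultToStringConverter returns each row as
#newline-separated JSON records (one per cell). Assuming that output format, a quick sketch for
#parsing the raw strings into Python dicts:
import json
parsed_rdd = hbase_rdd.flatMapValues(lambda v: v.split("\n")).mapValues(json.loads)
#Each element is now (row key, cell dict with fields such as 'qualifier' and 'value')
parsed_rdd.collect()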