Thursday 11 October 2012

Operating on HBase columns


HBase is a column-oriented database: it stores its contents by column rather than by row. Instead of retrieving a record (row) at a time, an entire column can be retrieved, which makes it powerful and efficient for data analytics, where we are usually concerned with only one field or column of a record. Access becomes much faster, and more of the relevant data can be extracted from the database in a shorter period of time.

As already mentioned in my previous post, apart from the basic read, write and delete operations I have developed another set of functions to perform union and intersection on HBase tables, and to use having, between and distinct clauses as we do in SQL. Since analytics usually touches only one or a few columns of a record, such column-level functions play a significant role.

The sample program below illustrates the following operations:

Obtaining all distinct entries of a column from a table
Obtaining all distinct entries of a column from a table, with the number of occurrences of each
Implementation of the Having operator
Implementation of the Having operator, extracting the entire matching rows
Implementation of the Between operator
Implementation of the Union operator
Implementation of the Intersection operator

Program :

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.FilterList.Operator;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseTest
{
private static HBaseConfiguration conf;
HBaseTest()
    {
        conf = new HBaseConfiguration();
        conf.addResource(new Path("/path_to_your_hbase/hbase-0.20.6/conf/hbase-site.xml"));
    }

// function to obtain distinct col entries from a table.
   
public Set<String> distinct(String tableName, String colFamilyName, String colName)
   {
    Set<String> set = new HashSet<String>();
    ResultScanner rs=null;
    Result res = null;
    String s = null;
    try
    {
        HTable table = new HTable(conf, tableName);
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes(colFamilyName),Bytes.toBytes(colName));
        rs = table.getScanner(scan);
        while((res=rs.next()) != null)
        {
            byte [] obtCol = res.getValue(Bytes.toBytes(colFamilyName+":"+colName));               
            s = Bytes.toString(obtCol);
            set.add(s);
        }
    } catch (IOException e)
    {
        System.out.println("Exception occurred in retrieving data");
    }
    finally
    {
        if (rs != null)
            rs.close();
    }
    return set;
   }

// function to return distinct entries with the number of occurrences of each.

public HashMap<String, Integer> distinctWithOccurrences(String tableName, String colFamilyName, String colName)
   {
    HashMap<String, Integer> map = new HashMap<String, Integer>();
    ResultScanner rs=null;
    Result res = null;
    try
    {
        HTable table = new HTable(conf, tableName);
        Scan scan = new Scan();
        rs = table.getScanner(scan);
        String s = null;
        while((res=rs.next()) != null)
        {
            byte [] obtCol = res.getValue(Bytes.toBytes(colFamilyName+":"+colName));
            s = Bytes.toString(obtCol);
            // increment the count for this entry, or start it at 1
            if(map.containsKey(s))
                map.put(s, map.get(s) + 1);
            else
                map.put(s, 1);
        }
    } catch (IOException e)
    {
        System.out.println("Exception occurred in retrieving data");
    }
    finally
    {
        if (rs != null)
            rs.close();
    }
    return map;
   }

// function implementing having clause.
   
public ArrayList<HashMap<String,String>> having(String tableName, String colFamilyName, String [] colName, String havingColName, String value)
  {
    ResultScanner rs = null;
    ArrayList<HashMap<String,String>> al = new ArrayList<HashMap<String,String>>();
    Result res = null;
    try
    {
        HTable table = new HTable(conf, tableName);
        Scan scan = new Scan();
        SingleColumnValueFilter singleColumnValueFilterA = new SingleColumnValueFilter(
                Bytes.toBytes(colFamilyName), Bytes.toBytes(havingColName), CompareOp.EQUAL, Bytes.toBytes(value));
              singleColumnValueFilterA.setFilterIfMissing(true); 
                FilterList filter = new FilterList(Operator.MUST_PASS_ALL, Arrays
                           .asList((Filter) singleColumnValueFilterA));
                scan.setFilter(filter); 
        rs = table.getScanner(scan);
        while((res=rs.next()) != null)
        {
            HashMap<String,String> map = new HashMap<String,String>();
            String s = null;
            for(int j=0 ; j < colName.length ; j++)
            {
            byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName),Bytes.toBytes(colName[j]));
            System.out.println(colName[j]);
            s = Bytes.toString(obtainedRow);
            map.put(colName[j],s);
        }           
        al.add(map);
        }
    } catch (IOException e)
    {
        System.out.println("Exception occurred in retrieving data");
    }
    finally
    {
        if (rs != null)
            rs.close();
    }
        return al;
   }

// function implementing having clause and extracting the entire rows.
   
public ArrayList<HashMap<String,String>> havingWithEntireRow(String tableName, String [] colFamilyName, String [][] colName, String havingColFamilyName, String havingColName, String value)
  {
    ResultScanner rs = null;
    ArrayList<HashMap<String,String>> al = new ArrayList<HashMap<String,String>>();
    Result res = null;
    try
    {
        HTable table = new HTable(conf, tableName);
        Scan scan = new Scan();
        SingleColumnValueFilter singleColumnValueFilterA = new SingleColumnValueFilter(
                    Bytes.toBytes(havingColFamilyName), Bytes.toBytes(havingColName), CompareOp.EQUAL, Bytes.toBytes(value));
            singleColumnValueFilterA.setFilterIfMissing(true); 
            FilterList filter = new FilterList(Operator.MUST_PASS_ALL, Arrays
                           .asList((Filter) singleColumnValueFilterA));
                scan.setFilter(filter); 
            rs = table.getScanner(scan);
        while((res=rs.next()) != null)
        {
            HashMap<String,String> map = new HashMap<String,String>();
            String s = null;
            for(int i=0 ; i< colFamilyName.length ; i++)
            {
            for(int j=0 ; j < colName[i].length ; j++)
            {
            byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName[i]),Bytes.toBytes(colName[i][j]));
            s = Bytes.toString(obtainedRow);
            map.put(colName[i][j],s);
            }   
            }
            al.add(map);
        }
    } catch (IOException e)
    {
        System.out.println("Exception occurred in retrieving data");
    }
    finally
    {
        if (rs != null)
            rs.close();
    }
        return al;
    }

// function implementing the between clause.
   
public ArrayList<HashMap<String,String>> between(String tableName, String colFamilyName, String [] colName, String betweenColName, String lowerValue, String upperValue)
   {
    ResultScanner rs = null;
    ArrayList<HashMap<String,String>> al = new ArrayList<HashMap<String,String>>();
    Result res = null;
    try
    {
        HTable table = new HTable(conf, tableName);
        Scan scan = new Scan();
        SingleColumnValueFilter singleColumnValueFilterA = new SingleColumnValueFilter(
                Bytes.toBytes(colFamilyName), Bytes.toBytes(betweenColName), CompareOp.GREATER, Bytes.toBytes(lowerValue));
                singleColumnValueFilterA.setFilterIfMissing(true); 
        SingleColumnValueFilter singleColumnValueFilterB = new SingleColumnValueFilter(
                     Bytes.toBytes(colFamilyName), Bytes.toBytes(betweenColName), CompareOp.LESS_OR_EQUAL, Bytes.toBytes(upperValue));
        singleColumnValueFilterB.setFilterIfMissing(true); 
        FilterList filter = new FilterList(Operator.MUST_PASS_ALL, Arrays.asList((Filter) singleColumnValueFilterA,
                                    singleColumnValueFilterB)); 
                scan.setFilter(filter); 
        rs = table.getScanner(scan);
        while((res=rs.next()) != null)
        {
            HashMap<String,String> map = new HashMap<String,String>();
            String s = null;
            for(int j=0 ; j < colName.length ; j++)
            {
            byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName),Bytes.toBytes(colName[j]));
            s = Bytes.toString(obtainedRow);
            map.put(colName[j],s);
            }           
            al.add(map);
        }
    } catch (IOException e)
    {
        System.out.println("Exception occurred in retrieving data");
    }
    finally
    {
        if (rs != null)
            rs.close();
    }
            return al;
   }

// function implementing union.

public ArrayList<HashMap<String,String>> union(String tableName, String colFamilyName1, String colFamilyName2, String [] colNames1, String [] colNames2, String colName1, String colName2, String value1, String value2)
   {
    ResultScanner rs = null;
    ArrayList<HashMap<String,String>> al = new ArrayList<HashMap<String,String>>();
    Result res = null;
    try
    {
        HTable table = new HTable(conf, tableName);
        Scan scan = new Scan();
        SingleColumnValueFilter singleColumnValueFilterA = new SingleColumnValueFilter(
                Bytes.toBytes(colFamilyName1), Bytes.toBytes(colName1), CompareOp.EQUAL, Bytes.toBytes(value1));
                singleColumnValueFilterA.setFilterIfMissing(true); 
                 
            SingleColumnValueFilter singleColumnValueFilterB = new SingleColumnValueFilter(
                     Bytes.toBytes(colFamilyName2), Bytes.toBytes(colName2), CompareOp.EQUAL, Bytes.toBytes(value2));
        singleColumnValueFilterB.setFilterIfMissing(true); 
                 
        FilterList filter = new FilterList(Operator.MUST_PASS_ONE, Arrays.asList((Filter) singleColumnValueFilterA,
                                    singleColumnValueFilterB)); 
                     
               scan.setFilter(filter); 
            rs = table.getScanner(scan);
        if(colFamilyName1.equals(colFamilyName2))
        {
        while((res=rs.next()) != null)
        {
            HashMap<String,String> map = new HashMap<String,String>();
            String s = null;
            for(int j=0 ; j < colNames1.length ; j++)
            {
            byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName1),Bytes.toBytes(colNames1[j]));
            System.out.println(colNames1[j]);
            s = Bytes.toString(obtainedRow);
            System.out.println(s);
            map.put(colNames1[j],s);
            }           
            al.add(map);
        }
        }
        else
        {
            while((res=rs.next()) != null)
            {
                HashMap<String,String> map = new HashMap<String,String>();
                String s = null;
                // extract row of the first col family
                for(int j=0 ; j < colNames1.length ; j++)
                {
                byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName1),Bytes.toBytes(colNames1[j]));
                s = Bytes.toString(obtainedRow);
                map.put(colNames1[j],s);
                }           
                // extract row of the second col family
                for(int k=0 ; k < colNames2.length ; k++)
                {
                byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName2),Bytes.toBytes(colNames2[k]));
                s = Bytes.toString(obtainedRow);
                map.put(colNames2[k],s);
                }       
                // put both in the arraylist
                al.add(map);
            }   
        }
    } catch (IOException e)
    {
        System.out.println("Exception occurred in retrieving data");
    }
    finally
    {
        if (rs != null)
            rs.close();
    }
    return al;
   }

// function implementing intersection.

public ArrayList<HashMap<String,String>> intersection(String tableName, String colFamilyName1, String colFamilyName2, String [] colNames1, String [] colNames2, String colName1, String colName2, String value1, String value2)
  {
    ResultScanner rs = null;
    ArrayList<HashMap<String,String>> al = new ArrayList<HashMap<String,String>>();
    Result res = null;
    try
    {
        HTable table = new HTable(conf, tableName);
        Scan scan = new Scan();
        SingleColumnValueFilter singleColumnValueFilterA = new SingleColumnValueFilter(
                    Bytes.toBytes(colFamilyName1), Bytes.toBytes(colName1), CompareOp.EQUAL, Bytes.toBytes(value1));
            singleColumnValueFilterA.setFilterIfMissing(true); 
        SingleColumnValueFilter singleColumnValueFilterB = new SingleColumnValueFilter(
                 Bytes.toBytes(colFamilyName2), Bytes.toBytes(colName2), CompareOp.EQUAL, Bytes.toBytes(value2));
                singleColumnValueFilterB.setFilterIfMissing(true); 
                 
        FilterList filter = new FilterList(Operator.MUST_PASS_ALL, Arrays.asList((Filter) singleColumnValueFilterA,
                                    singleColumnValueFilterB)); 
                scan.setFilter(filter); 
        rs = table.getScanner(scan);
        if(colFamilyName1.equals(colFamilyName2))
        {
        while((res=rs.next()) != null)
        {
            HashMap<String,String> map = new HashMap<String,String>();
            String s = null;
            for(int j=0 ; j < colNames1.length ; j++)
            {
            byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName1),Bytes.toBytes(colNames1[j]));
            s = Bytes.toString(obtainedRow);
            map.put(colNames1[j],s);
            }           
            al.add(map);
        }
        }
        else
        {
            while((res=rs.next()) != null)
            {
                HashMap<String,String> map = new HashMap<String,String>();
                String s = null;
                // extract row of the first col family
                for(int j=0 ; j < colNames1.length ; j++)
                {
                byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName1),Bytes.toBytes(colNames1[j]));
                s = Bytes.toString(obtainedRow);
                //System.out.println(s);
                map.put(colNames1[j],s);
                }           
                // extract row of the second col family
                for(int k=0 ; k < colNames2.length ; k++)
                {
                byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName2),Bytes.toBytes(colNames2[k]));
                s = Bytes.toString(obtainedRow);
                map.put(colNames2[k],s);
                }       
                // put both in the arraylist
                al.add(map);
            }   
        }
        } catch (IOException e)
        {
            System.out.println("Exception occurred in retrieving data");
        }
        finally
        {
            if (rs != null)
                rs.close();
        }
        return al;
   }

public static void main(String args[])
    {
        HBaseTest test  = new HBaseTest();
        String tableName = "testing_table" ;
        String [] colFamilyNames = {"colFamily1","colFamily2"};
        String [][] colNames  = {{"Id","Name"},{"Addr","Designation"}};

        Set<String> set = test.distinct(tableName, "colFamily1", "Name");
        Iterator<String> iterator = set.iterator();
        while(iterator.hasNext())
        {
            System.out.println(iterator.next());
        }

        HashMap<String, Integer> map = test.distinctWithOccurrences(tableName, "colFamily2", "Designation");

        ArrayList<HashMap<String,String>> al_having =
                test.havingWithEntireRow(tableName, colFamilyNames, colNames, "colFamily1", "Name", "Jayati");

        String [] reqdFieldNames1 = {"Id","Name"};
        String [] reqdFieldNames2 = {"Id","Name"};
        ArrayList<HashMap<String,String>> al_intersection =
                test.intersection(tableName, "colFamily1", "colFamily1", reqdFieldNames1, reqdFieldNames2, "Id", "Name", "1", "Jayati");
   
    // similarly union and between can be used
    }
}

Utilize HBase basic read-write functions


HBase is an open source, non-relational, distributed database providing BigTable-like capabilities for Hadoop. Tables in HBase can be accessed using the Java API for HBase, but unfortunately a developer has to put in a lot of effort to do so, because the API provides a very restricted set of functions. For those new to the API, it takes a lot of time to understand the available classes and use them to get the required job done.

So to enable easy handling of HBase tables, I have developed a wrapper library over the existing API which provides basic methods to create, read and delete records in an HBase table, plus another set of functions such as distinct, having, between, intersection and union which work for HBase just as they do in SQL. A big fraction of our work on tables depends on these functions, and having them available makes the HBase API much easier to use.

This post includes a sample program to illustrate the usage of the read and write functions only, which specifically covers the following operations:

Adding entry to a single column
Adding records with a single column family and multiple columns
Adding a row with any number of column families and columns
Obtaining a single column entry
Obtaining the entire row
Reading all entries of a particular column of a table
Reading all records of an HBase Table
Deleting a record from an HBase Table

I have used hbase-0.20.6 and hadoop-0.20.1; you can deploy this program in your Eclipse workspace and write test classes to check it.
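The sample program assumes that the table and its column families already exist. If you still need to create them, a minimal sketch using the 0.20-era admin API could look like the following (the table and family names are simply the ones used in the program below):

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateTestingTable
{
    public static void main(String[] args) throws Exception
    {
        HBaseConfiguration conf = new HBaseConfiguration();
        HBaseAdmin admin = new HBaseAdmin(conf);
        // "testing_table" with the two column families used in the sample program
        HTableDescriptor desc = new HTableDescriptor("testing_table");
        desc.addFamily(new HColumnDescriptor("colFamily1"));
        desc.addFamily(new HColumnDescriptor("colFamily2"));
        if (!admin.tableExists("testing_table"))
            admin.createTable(desc);
    }
}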

Program :

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.FilterList.Operator;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseTest
{
private static HBaseConfiguration conf;
HBaseTest()
    {
        conf = new HBaseConfiguration();
        conf.addResource(new Path("/path_to_your_hbase/hbase-0.20.6/conf/hbase-site.xml"));
    }

// assigns a value to a particular column of a record

public void addAColumnEntry(String tableName, String colFamilyName, String colName, String data)
    {
        try
        {
            HTable table = new HTable(conf, tableName);
            String row = "row" + Math.random();
            byte[] rowKey = Bytes.toBytes(row);
            Put putdata = new Put(rowKey);
            putdata.add(Bytes.toBytes(colFamilyName), Bytes.toBytes(colName),Bytes.toBytes(data));
            table.put(putdata);
        } catch (IOException e)
        {
            System.out.println("Exception occured in adding data");
        }
    }

// write a record to a table having just one column family or write only a portion of a record

public void addRecordWithSingleColumnFamily(String tableName, String colFamilyName, String [] colName,String [] data)
    {
        try
        {
            HTable table = new HTable(conf, tableName);
            String row = "row" + Math.random();
            byte[] rowKey = Bytes.toBytes(row);
            Put putdata = new Put(rowKey);
            if(colName.length == data.length)
            {
            for(int i=0 ; i < colName.length ; i++)
            putdata.add(Bytes.toBytes(colFamilyName), Bytes.toBytes(colName[i]),
                    Bytes.toBytes(data[i]));
            }
            table.put(putdata);
     
        } catch (IOException e)
        {
            System.out.println("Exception occured in adding data");
        }
    }

// add a record with any number of column families

public void addRecord(String tableName, String [] colFamilyName, String [][]  colName,String [][] data)
    {
        try
        {
            HTable table = new HTable(conf, tableName);
            String row = "row" + Math.random();
            byte[] rowKey = Bytes.toBytes(row);
            Put putdata = new Put(rowKey);
            for(int j=0 ; j < colFamilyName.length ; j++)
            {
            if(colName[j].length == data[j].length)
            {
            for(int i=0 ; i < colName[j].length ; i++)
            putdata.add(Bytes.toBytes(colFamilyName[j]), Bytes.toBytes(colName[j][i]),
                    Bytes.toBytes(data[j][i]));
            }
            }
            table.put(putdata);
     
        } catch (IOException e)
        {
            System.out.println("Exception occured in adding data");
        }
    }

// returns entry of a particular column of a record

public String getColEntry(String tableName, String rowName,String colFamilyName, String colName)
    {
        String result = null;
        try
        {
            HTable table = new HTable(conf, tableName);
            byte[] rowKey = Bytes.toBytes(rowName);
            Get getRowData = new Get(rowKey);
            Result res = table.get(getRowData);
            byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName),
                    Bytes.toBytes(colName));
            result = Bytes.toString(obtainedRow);
        } catch (IOException e)
        {
            System.out.println("Exception occured in retrieving data");
        }
        return result;
    }
 
// returns a row  in the form of a string.
 
public String getRow(String tableName, String rowName,String colFamilyName, String [] colName)
    {
            String result = colName[0];
        try
        {
            HTable table = new HTable(conf, tableName);
            byte[] rowKey = Bytes.toBytes(rowName);
            Get getRowData = new Get(rowKey);
            Result res = table.get(getRowData);
            for(int j=0 ; j < colName.length ; j++)
            {
            byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName),Bytes.toBytes(colName[j]));
            System.out.println(colName[j]);
            String s = Bytes.toString(obtainedRow);
            if(j==0)
                result = colName[j] + "=" + s ;
            else
                result = result + "&" + colName[j] + "=" + s;
            System.out.println(s);
            }
         
        } catch (IOException e)
        {
            System.out.println("Exception occured in retrieving data");
        }
        return result;
    }
 
// returns an arraylist of all entries of a column.
 
public ArrayList<String> getCol(String tableName, String colFamilyName, String colName)
    {
        ArrayList<String> al = new ArrayList<String>();
        ResultScanner rs=null;
        Result res = null;
     
        try {
            HTable table = new HTable(conf, tableName);
         
            Scan scan = new Scan();
            scan.addColumn(Bytes.toBytes(colFamilyName),Bytes.toBytes(colName));
            rs = table.getScanner(scan);
            while((res=rs.next()) != null)
            {
                String colEntry = null;
                byte [] obtCol = res.getValue(Bytes.toBytes(colFamilyName+":"+colName));             
                colEntry = Bytes.toString(obtCol);
                al.add(colEntry);
            }
         
        } catch (IOException e)
        {
            System.out.println("Exception occurred in retrieving data");
        }
        finally
        {
            if (rs != null)
                rs.close();
        }
        return al;

    }
 
// returns a list of hashmaps, each hashmap containing entries of a single record.

public ArrayList<HashMap<String,String>> getTable(String tableName, String [] colFamilyName, String [][] colName)
    {
    ResultScanner rs = null;
    ArrayList<HashMap<String,String>> al = new ArrayList<HashMap<String,String>>();
    Result res = null;
    try
    {
        HTable table = new HTable(conf, tableName);
        Scan scan = new Scan();
        rs = table.getScanner(scan);
        while((res=rs.next()) != null)
        {
            HashMap<String,String> map = new HashMap<String,String>();
            String s = null;
            for(int i=0 ; i < colFamilyName.length ; i++)
            {
                for(int j=0 ; j < colName[i].length ; j++)
                    {
                        byte[] obtainedRow = res.getValue(Bytes.toBytes(colFamilyName[i]),Bytes.toBytes(colName[i][j]));
                        s = Bytes.toString(obtainedRow);
                        System.out.println(s);
                        map.put(colName[i][j],s);
                    }     
            }         
            al.add(map);
        }
    } catch (IOException e)
        {
            System.out.println("Exception occurred in retrieving data");
        }
    finally
    {
        if (rs != null)
            rs.close();
    }
        return al;
    }

// function to delete a row from the table.

public String deleteTableRow(String tableName, String rowName)
   {
   String result = null;
   try
   {
    HTable table = new HTable(conf, tableName);
    byte[] rowKey = Bytes.toBytes(rowName);
    Delete delRowData = new Delete(rowKey);
    table.delete(delRowData);
   } catch (IOException e)
    {
        System.out.println("Exception occurred in deleting data");
    }
  return result;

  }

public static void main(String args[])
    {
        HBaseTest test  = new HBaseTest();
        String tableName = "testing_table" ;
        String [] colFamilyNames = {"colFamily1","colFamily2"};
        String [][] colNames  = {{"Id","Name"},{"Addr","Designation"}};
         
        test.addAColumnEntry(tableName, "colFamily1", "Name", "Ram");
        test.addRecordWithSingleColumnFamily(tableName, "colFamily1",
                new String[]{"Id","Name"}, new String[]{"117","Ram"});
        test.addRecord(tableName, colFamilyNames, colNames,
                new String[][]{{"117","Ram"},{"ABC","Manager"}});

        // specify the rowKey as per your table
        String rowKey = "row0.35234564623454";

        test.getColEntry(tableName, rowKey, "colFamily1", "Name");
        String yourRow = test.getRow(tableName, rowKey, "colFamily1", new String[]{"Id","Name"});

        ArrayList<String> al = test.getCol(tableName, "colFamily1", "Name");

        ArrayList<HashMap<String,String>> listofmaps = test.getTable(tableName, colFamilyNames, colNames);

        test.deleteTableRow(tableName, rowKey);
    }
}

HBase coprocessors

Why HBase Coprocessors?

HBase has very effective MapReduce integration for distributed computation over data stored within its tables, but in many cases – for example simple additive or aggregating operations like summing, counting, and the like – pushing the computation up to the server where it can operate on the data directly without communication overheads can give a dramatic performance improvement over HBase’s already good scanning performance.
Also, before 0.92, it was not possible to extend HBase with custom functionality except by extending the base classes.

What are HBase Coprocessors?

In order to support sufficient flexibility for potential coprocessor behaviors, the framework provides two different aspects of extension. One is the observer, which is like a trigger in a conventional database; the other is the endpoint, a dynamic RPC endpoint that resembles a stored procedure.
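As a rough illustration (not taken from the article), a minimal observer sketch against the 0.92-era API might look like this; the class name and body are hypothetical:

import java.io.IOException;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;

// Hypothetical observer: runs on the region server before every Put,
// much like a BEFORE INSERT trigger in a conventional database.
public class AuditObserver extends BaseRegionObserver
{
    @Override
    public void prePut(ObserverContext<RegionCoprocessorEnvironment> ctx,
                       Put put, WALEdit edit, boolean writeToWAL) throws IOException
    {
        // validate, transform or log the incoming Put here;
        // ctx.bypass() would skip the default processing entirely
    }
}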

What can HBase Coprocessors be used for?

Exciting new features can be built on top of coprocessors, for example secondary indexing, complex filtering (push-down predicates), and access control.
These are just a couple of the interesting points from this excellent article; I strongly suggest reading it.

Wednesday 3 October 2012

Hadoop on Eclipse

Install Hadoop plug-in

The next step is to install and check the Hadoop plug-in for Eclipse.
  1. Download Eclipse for Linux and extract it with tar -xvzf eclipse.tar.gz
  2. Copy the file "hadoop-0.19.1-eclipse-plugin.jar" from the Hadoop eclipse-plugin folder to the Eclipse plugins folder as shown in the figure below.
    Copy Hadoop Eclipse Plugin
  3. Close both windows
  4. Start Eclipse
  5. Configure the JDK, if it is not already set, by adding a -vm entry to the eclipse.ini file that points to your JDK (see the example after this list).
  6. Click on the open perspective icon, which is usually located in the upper-right corner of the Eclipse application. Then select Other from the menu.
  7. Select Map/Reduce from the list of perspectives and press "OK" button.
  8. As a result your IDE should open a new perspective that looks similar to the image below.

    Eclipse Map/Reduce Perspective
Now that we have installed and configured the Hadoop cluster and the Eclipse plug-in, it's time to test the setup by running a simple project.
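For reference, a -vm entry in eclipse.ini takes two lines and must appear before -vmargs; the JDK path below is only an example and should point to your own installation:

-vm
/usr/lib/jvm/java-6-sun/bin/java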

Setup Hadoop Location in Eclipse

The next step is to configure a Hadoop location in the Eclipse environment.
  1. Launch the Eclipse environment.
  2. Open Map/Reduce perspective by clicking on the open perspective icon (), select "Other" from the menu, and then select "Map/Reduce" from the list of perspectives.
  3. After switching to the Map/Reduce perspective, select the Map/Reduce Locations tab located at the bottom of the Eclipse environment. Then right click on the blank space in that tab and select "New Hadoop location...." from the context menu. You should see a dialog box similar to the one shown below.

    Setting up new Map/Reduce location
  4. Fill in the following items, as shown on the figure above (the two ports must match your Hadoop configuration; see the sketch at the end of this section).
    • Location Name -- localhost
    • Map/Reduce Master
      • Host -- localhost
      • Port -- 9101
    • DFS Master
      • Check "Use M/R Master Host"
      • Port -- 9100
    • User name -- User
    Then press the Finish button.
  5. After closing the Hadoop location settings dialog you should see a new location in the "Map/Reduce Locations" tab.
  6. In the Project Explorer tab on the left hand side of the Eclipse window, find the DFS Locations item. Open it using the "+" icon on its left. Inside, you should see the localhost location reference with the blue elephant icon. Keep opening the items below it until you see something like the image below.
    Browsing HDFS location
You can now move on to the next step.
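The Map/Reduce Master and DFS Master ports entered above must agree with the ports your Hadoop cluster actually uses. Assuming the 9100/9101 values from this walkthrough, the corresponding entries in your Hadoop configuration (hadoop-site.xml on this version, or core-site.xml and mapred-site.xml on later ones) would look roughly like this:

<property>
  <name>fs.default.name</name>
  <value>hdfs://localhost:9100</value>
</property>
<property>
  <name>mapred.job.tracker</name>
  <value>localhost:9101</value>
</property>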

Creating and configuring Hadoop eclipse project.

  1. Launch Eclipse.
  2. Right-click on the blank space in the Project Explorer window and select New -> Project.. to create a new project.
  3. Select Map/Reduce Project from the list of project types as shown in the image below.

  4. Press the Next button.
  5. You will see the project properties window similar to the one shown below

  6. Fill in the project name and click on Configure Hadoop Installation link on the right hand side of the project configuration window. This will bring up the project Preferences window shown in the image below.



  7. In the project Preferences window enter the location of the Hadoop directory in the Hadoop installation directory field as shown above.
    If you are not sure of the location of the Hadoop home directory, refer to Step 1 of this section. Hadoop home directory is one level up from the conf directory.

  8. After entering the location close the Preferences window by pressing the OK button. Then close the Project window with the Finish button.
  9. You have now created your first Hadoop Eclipse project. You should see its name in the Project Explorer tab.

Creating Map/Reduce driver class

  1. Right-click on the newly created Hadoop project in the Project Explorer tab and select New -> Other from the context menu.
  2. Go to the Map/Reduce folder, select MapReduceDriver, then press the Next button as shown in the image below.

  3. When the MapReduce Driver wizard appears, enter TestDriver in the Name field and press the Finish button. This will create the skeleton code for the MapReduce Driver.

  4. Unfortunately the Hadoop plug-in for Eclipse is slightly out of step with the recent Hadoop API, so we need to edit the driver code a bit.

    Find the following two lines in the source code and comment them out:
    conf.setInputPath(new Path("src"));
    conf.setOutputPath(new Path("out"));
    Enter the following code immediately after the two lines you just commented out (see image below); a complete driver sketch appears at the end of this list:
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("In"));
    FileOutputFormat.setOutputPath(conf, new Path("Out"));



  5. After you have changed the code, you will see the new lines marked as incorrect by Eclipse. Click on the error icon for each line and select Eclipse's suggestion to import the missing class.

    You need to import the following classes: TextInputFormat, TextOutputFormat, FileInputFormat, FileOutputFormat.
  6. After the missing classes are imported you are ready to run the project.
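
Putting the pieces together, the edited driver might look roughly like the sketch below. The mapper and reducer lines are left as placeholders, since the wizard-generated skeleton supplies its own; the In/Out paths are the ones used above:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class TestDriver
{
    public static void main(String[] args) throws Exception
    {
        JobConf conf = new JobConf(TestDriver.class);
        conf.setJobName("TestDriver");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // conf.setMapperClass(...);   // plug in the generated mapper here
        // conf.setReducerClass(...);  // plug in the generated reducer here

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path("In"));
        FileOutputFormat.setOutputPath(conf, new Path("Out"));

        JobClient.runJob(conf);
    }
}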

Running Hadoop Project


  1. Right-click on the TestDriver class in the Project Explorer tab and select Run As --> Run on Hadoop. This will bring up a window like the one shown below.

  2. In the window shown above, select "Choose existing Hadoop location", then select localhost from the list below. After that, click the Finish button to start your project.
  3. If you see console output similar to the one shown below, Congratulations! You have started the project successfully!