Wednesday, May 11, 2016

Pig Script for Comparison of 2 Big Files

Pig Script for Comparison of 2 Files

Goals

  • Find out whether both the files/folders have same number of lines
  • Find out whether both the files/folders have same number of tuples when grouped by all dimensions
  • Find out all the tuples in a Group of one File should be having a one to one match(Measures will match with a tolerance) in another file/folder

Script


#!/bin/bash

FIRST_FILE="/input/output-mapred.txt";
SECOND_FILE="/input/output-spork.txt";
OUTPUT_FILE="/input/output-diff.txt";
DIMENSION_STRING="1,2,4"
MEASURE_STRING="3";
UNIVERSAL_TOLERANCE="2";
PIG_STORAGE_DELIMETER="\t";
DIFF_PIG_SCRIPT_LOCATION="/tmp/diff-f1.pig";
DIFF_PIG_SCRIPT_LOCATION_REVERSE="/tmp/diff-f2";

function usage () {
 me=$(basename $BASH_SOURCE)
 echo 
 echo "$me script helps to get diff of 2 files in HDFS with given tolerance."
    echo 
    echo "Usage : "
    echo "./$me [Options]"
    echo " Options : "
    echo -e "\t -h --help"
    echo -e "\t\t - Print this"
    echo -e "\t -file1=[ABSOLUTE_FILE_PATH]" 
    echo -e "\t\t - File/Folder to compare with"
    echo -e "\t -file2=[ABSOLUTE_FILE_PATH]" 
    echo -e "\t\t - File/Folder to compare to" 
    echo -e "\t -outputfile=[ABSOLUTE_FILE_PATH]" 
    echo -e "\t\t - File/Folder to save difference to" 
    echo -e "\t -dimpos=[COMMA_SEPERATED_DIMENSIONS_POSITIONS]" 
    echo -e "\t\t - Positions of Dimensions in files"
    echo -e "\t -meapos=[COMMA_SEPERATED_MEASURES_POSITIONS]" 
    echo -e "\t\t - Positions of Measures in files"
    echo -e "\t -tollimit=[TOLERANCE_LIMIT]" 
    echo -e "\t\t - Tolerance limit when comparing measures"
    echo -e "\t -delimeter=[DELIMETER]" 
    echo -e "\t\t - Delimeter to be used in PigStorage. Default is tab"
}

if [ -z "$1" ] 
then
    usage
    exit
fi

while [ "$1" != "" ]; do
    PARAM=`echo $1 | awk -F= '{print $1}'`
    VALUE=`echo $1 | awk -F= '{print $2}'`
    case $PARAM in
        -h | --help)
            usage
            exit
            ;;
 -tollimit)
            UNIVERSAL_TOLERANCE=$VALUE
            ;;
 -delimeter)
     PIG_STORAGE_DELIMETER=$VALUE
     ;;
 -meapos)
            MEASURE_STRING=$VALUE
            ;;
 -dimpos)
            DIMENSION_STRING=$VALUE
            ;;
 -file2)
            SECOND_FILE=$VALUE
            ;;
 -file1)
     FIRST_FILE=$VALUE
     ;;
 -outputfile)
     OUTPUT_FILE=$VALUE
     ;;
 *)
            echo "ERROR: unknown parameter \"$PARAM\""
            usage
            exit 1
            ;;
    esac
    shift
done

#Convert Strings to Arrays
IFS=', ' read -r -a DIMENSIONS_ARRAY <<< "$DIMENSION_STRING"
IFS=', ' read -r -a MEASURES_ARRAY <<< "$MEASURE_STRING"

containsElement () {
  local e
  for e in "${@:2}"; do [[ "$e" == "$1" ]] && return 0; done
  return 1
}

#Check if any column is in both Dimensions and Measures 
for element in "${MEASURES_ARRAY[@]}"
do
  containsElement "$element" ${DIMENSIONS_ARRAY[@]}
  if [ $? = 0 ] ; then
 echo "I am confused about column $element .. its both in Dimensions and Measures .. "
 echo "Can't go forward ... exiting ... " 
 exit 1;
  fi
done

MEASURES_SIZE=${#MEASURES_ARRAY[@]};
DIMENSIONS_SIZE=${#DIMENSIONS_ARRAY[@]};
COLUMNS_SIZE=$MEASURES_SIZE+$DIMENSIONS_SIZE;

SCHEMA_STRING="";
ALL_DIMENSIONS="";
ALL_MEASURES="";
for element in "${DIMENSIONS_ARRAY[@]}"
do
   SCHEMA_STRING="$SCHEMA_STRING , field$element:chararray";
 ALL_DIMENSIONS="$ALL_DIMENSIONS , field$element";
done
for element in "${MEASURES_ARRAY[@]}"
do
        SCHEMA_STRING="$SCHEMA_STRING , field$element:float";
 ALL_MEASURES="$ALL_MEASURES , field$element";
done

SCHEMA_STRING=`echo "$SCHEMA_STRING" | sed 's/ , //'`;
ALL_DIMENSIONS=`echo "$ALL_DIMENSIONS" | sed 's/ , //'`;
ALL_MEASURES=`echo "$ALL_MEASURES" | sed 's/ , //'`;

#Starting to create pig script
SCRIPT_CONTENT="";
echo "f1 = LOAD '$FIRST_FILE' USING PigStorage('$PIG_STORAGE_DELIMETER') as ($SCHEMA_STRING);" >$DIFF_PIG_SCRIPT_LOCATION
echo "f2 = LOAD '$SECOND_FILE' USING PigStorage('$PIG_STORAGE_DELIMETER') as ($SCHEMA_STRING);" >>$DIFF_PIG_SCRIPT_LOCATION
echo "g_f1 = GROUP f1 BY ($ALL_DIMENSIONS);" >>$DIFF_PIG_SCRIPT_LOCATION
echo "g_f2 = GROUP f2 BY ($ALL_DIMENSIONS);" >>$DIFF_PIG_SCRIPT_LOCATION
echo "g_f1_flatten  = FOREACH g_f1 GENERATE FLATTEN (group) as ($ALL_DIMENSIONS)" >>$DIFF_PIG_SCRIPT_LOCATION
TMP="";
for element in "${MEASURES_ARRAY[@]}" 
do
  TMP="$TMP , SUM(f1.field$element)  as field$element" 
done
echo "$TMP;" >>$DIFF_PIG_SCRIPT_LOCATION
echo "g_f2_flatten  = FOREACH g_f2 GENERATE FLATTEN (group) as ($ALL_DIMENSIONS)" >>$DIFF_PIG_SCRIPT_LOCATION
TMP="";
for element in "${MEASURES_ARRAY[@]}" 
do
        TMP="$TMP , SUM(f2.field$element)  as field$element"
done
echo "$TMP;" >>$DIFF_PIG_SCRIPT_LOCATION
echo "joined = JOIN g_f1_flatten  by ($ALL_DIMENSIONS),g_f2_flatten  by ($ALL_DIMENSIONS);" >>$DIFF_PIG_SCRIPT_LOCATION
echo "list = FOREACH joined " >>$DIFF_PIG_SCRIPT_LOCATION
echo "GENERATE " >>$DIFF_PIG_SCRIPT_LOCATION
for element in "${MEASURES_ARRAY[@]}" 
do
 echo "g_f1_flatten::field$element as f1field$element, g_f2_flatten::field$element as f2field$element," >>$DIFF_PIG_SCRIPT_LOCATION
done
count=${#DIMENSIONS_ARRAY[@]};
for element in "${DIMENSIONS_ARRAY[@]}"
do
 echo "g_f1_flatten::field$element as f1field$element, g_f2_flatten::field$element as f2field$element" >>$DIFF_PIG_SCRIPT_LOCATION
 if [ "$count" -ge 2 ];then
                echo "," >>$DIFF_PIG_SCRIPT_LOCATION
        fi
        count=`expr $count - 1`;
done
echo ";" >>$DIFF_PIG_SCRIPT_LOCATION
echo "filtered_list = FILTER list by " >>$DIFF_PIG_SCRIPT_LOCATION
TMP="";
for element in "${MEASURES_ARRAY[@]}"
do
 TMP="$TMP  AND ((f1field$element-f2field$element>$UNIVERSAL_TOLERANCE) OR (f1field$element-f2field$element<-$UNIVERSAL_TOLERANCE))";
done
TMP=`echo $TMP | sed 's/AND//'`;
echo $TMP >>$DIFF_PIG_SCRIPT_LOCATION
echo ";" >>$DIFF_PIG_SCRIPT_LOCATION
echo "store filtered_list INTO '$OUTPUT_FILE';" >>$DIFF_PIG_SCRIPT_LOCATION
cat $DIFF_PIG_SCRIPT_LOCATION
  • chmod +x generateDiffPigScripts.sh
Usage
Icon
generateDiffPigScripts.sh script helps to get diff of 2 files in HDFS with given tolerance.
Usage : 
./generateDiffPigScripts.sh [Options]
 Options : 
  -h --help
   - Print this
  -file1=[ABSOLUTE_PATH]
   - File/Folder to compare with
  -file2=[ABSOLUTE_PATH]
   - File/FOLDER to compare to
  -dimpos=[COMMA_SEPERATED_DIMENSIONS_POSITIONS]
   - Positions of Dimensions in files
  -meapos=[COMMA_SEPERATED_MEASURES_POSITIONS]
   - Positions of Measures in files
  -tollimit=[TOLERANCE_LIMIT]
   - Tolerance limit when comparing measures
  -delimeter=[DELIMETER]
   - Delimeter to be used in PigStorage. Default is tab
Usage Example
Icon
  • ./generateDiffPigScripts.sh -file2="/input/output-mapred.txt" -file1="/input/output-spork.txt" -dimpos="1,2,4,5" -meapos="3,6" -tollimit="2" -delimeter='\u0001'
  • This will generate a file /tmp/diff-f1.pig
  • /usr/hdp/current/pig-client/bin/pig /tmp/diff-f1.pig
  • This will show difference of file2 w.r.t file1

Input Files

  • hadoop fs -cat /input/output-mapred.txt
        yagna dande 28.5 company
    yagna dande 26.5 company
    yagna dande 29.5 company
    hemanth dande 33 planetsoft
    hema ch 28 satyam
    giri ch 32 narayana

diff 1 2 - Tolerance is 2.0

yagna,dande,company,26.5

  • hadoop fs -cat /input/output-spork.txt
giri ch 32 narayana
hema ch 28 satyam
hemanth dande 33 planetsoft
yagna dande 29 company
yagna dande 39 company
yagna1 dande 29 company

diff 2 1 - Tolerance is 2.0

yagna,dande,company,39.0
yagna1,dande,company,29.0

Proof for Columns order will be intact when run with MapReduce and Spark

  1. wget http://hortonassets.s3.amazonaws.com/pig/lahman591-csv.zip
  2. unzip it
  3. upload Master.csv and Batting.csv
  4. Execute pig Script
batting = load '/input/Batting.csv' using PigStorage(',');
raw_runs = FILTER batting BY $1>0;
runs = FOREACH raw_runs GENERATE $0 as playerID, $1 as year, $8 as runs;
grp_data = GROUP runs by (year);
max_runs = FOREACH grp_data GENERATE group as grp,MAX(runs.runs) as max_runs;
join_max_run = JOIN max_runs by ($0, max_runs), runs by (year,runs);
join_data = FOREACH join_max_run GENERATE $0 as year, $2 as playerID, $1 as runs;
DUMP join_data;

Spark

(1966,aloufe01,122.0)
(1966,robinfr02,122.0)
(1876,barnero01,126.0)
(1957,mantlmi01,121.0)
(1872,eggleda01,94.0)
(1894,hamilbi01,192.0)
(1904,brownge01,99.0)
(1906,chancfr01,103.0)
(1906,wagneho01,103.0)
(1955,snidedu01,126.0)
(1913,collied01,125.0)
(1940,willite01,134.0)
(1902,fultzda01,109.0)
(1902,hartsto01,109.0)
(1889,griffmi01,152.0)
(1889,stoveha01,152.0)
(1937,dimagjo01,151.0)
(1926,ruthba01,139.0)
(1878,highadi01,60.0)
(1980,wilsowi02,133.0)
(1951,kinerra01,124.0)
(1951,musiast01,124.0)
(1997,biggicr01,146.0)
(1968,beckegl01,98.0)
(1939,rolfere01,139.0)
(1931,gehrilo01,163.0)
(1908,mcintma01,105.0)
(2006,sizemgr01,134.0)
(1944,stirnsn01,125.0)
(1977,carewro01,128.0)
(1911,cobbty01,147.0)
(1962,robinfr02,134.0)
(1915,cobbty01,144.0)
(1960,mantlmi01,119.0)
(1896,burkeje01,160.0)
(1933,gehrilo01,138.0)
(1883,stoveha01,110.0)
(1942,willite01,141.0)
(1900,thomaro01,132.0)
(1890,duffyhu01,161.0)
(1953,snidedu01,132.0)
(1986,henderi01,130.0)
(1975,rosepe01,112.0)
(1898,mcgrajo01,143.0)
(1964,allendi01,125.0)
(2000,bagweje01,152.0)
(1881,gorege01,86.0)
(1988,boggswa01,128.0)
(1922,hornsro01,141.0)
(1971,brocklo01,126.0)
(2002,soriaal01,128.0)
(1979,baylodo01,120.0)
(1874,mcveyca01,91.0)
(1999,bagweje01,143.0)
(1892,childcu01,136.0)
(2008,ramirha01,125.0)
(2004,pujolal01,133.0)
(1982,molitpa01,136.0)
(1973,bondsbo01,131.0)
(1887,oneilti01,167.0)
(1885,stoveha01,130.0)
(1924,ruthba01,143.0)
(1917,bushdo01,112.0)
(1991,molitpa01,133.0)
(1935,galanau01,133.0)
(1920,ruthba01,158.0)
(1959,pinsova01,131.0)
(1919,ruthba01,103.0)
(1928,ruthba01,163.0)
(1995,biggicr01,123.0)
(2011,grandcu01,136.0)
(1948,henrito01,138.0)
(1993,dykstle01,143.0)
(1984,evansdw01,121.0)
(1946,willite01,142.0)
(1921,ruthba01,177.0)
(1893,longhe01,149.0)
(1990,henderi01,119.0)
(1934,gehrich01,134.0)
(2010,pujolal01,115.0)
(1918,grohhe01,86.0)
(1901,lajoina01,145.0)
(1996,burksel01,142.0)
(1963,aaronha01,121.0)
(1923,ruthba01,151.0)
(1932,kleinch01,152.0)
(1916,cobbty01,113.0)
(1877,orourji01,68.0)
(1958,mantlmi01,127.0)
(1978,lefloro01,126.0)
(2001,sosasa01,146.0)
(1936,gehrilo01,167.0)
(1974,rosepe01,110.0)
(1965,harpeto01,126.0)
(1965,versazo01,126.0)
(1947,mizejo01,137.0)
(1873,barnero01,125.0)
(1956,mantlmi01,132.0)
(1985,henderi01,146.0)
(1888,pinknge01,134.0)
(1994,thomafr04,106.0)
(2003,pujolal01,137.0)
(1884,dunlafr01,160.0)
(1970,willibi01,137.0)
(1967,aaronha01,113.0)
(1967,brocklo01,113.0)
(1871,barnero01,66.0)
(1910,mageesh01,110.0)
(1925,cuyleki01,144.0)
(2007,rodrial01,143.0)
(1938,greenha01,144.0)
(1981,henderi01,89.0)
(1929,hornsro01,156.0)
(1927,ruthba01,158.0)
(1905,donlimi01,124.0)
(1912,collied01,137.0)
(1914,collied01,122.0)
(1943,vaughar01,112.0)
(1983,raineti01,133.0)
(1972,morgajo02,122.0)
(1976,rosepe01,130.0)
(1969,jacksre01,123.0)
(1895,hamilbi01,166.0)
(1941,willite01,135.0)
(1945,stanked01,128.0)
(1998,sosasa01,134.0)
(1954,mantlmi01,129.0)
(1897,hamilbi01,152.0)
(1907,shannsp01,104.0)
(2005,pujolal01,129.0)
(1879,jonesch01,85.0)
(1903,beaumgi01,137.0)
(1930,kleinch01,158.0)
(1909,leachto01,126.0)
(2009,pujolal01,124.0)
(1952,hemusso01,105.0)
(1952,musiast01,105.0)
(1875,barnero01,115.0)
(1880,dalryab01,91.0)
(1899,keelewi01,140.0)
(1899,mcgrajo01,140.0)
(1949,willite01,150.0)
(1989,boggswa01,113.0)
(1961,mantlmi01,132.0)
(1961,marisro01,132.0)
(1891,brownto01,177.0)
(1987,raineti01,123.0)
(1992,phillto02,114.0)
(1886,kellyki01,155.0)
(1882,gorege01,99.0)
(1950,dimagdo01,131.0)

Map Reduce

(1871,barnero01,66.0)
(1872,eggleda01,94.0)
(1873,barnero01,125.0)
(1874,mcveyca01,91.0)
(1875,barnero01,115.0)
(1876,barnero01,126.0)
(1877,orourji01,68.0)
(1878,highadi01,60.0)
(1879,jonesch01,85.0)
(1880,dalryab01,91.0)
(1881,gorege01,86.0)
(1882,gorege01,99.0)
(1883,stoveha01,110.0)
(1884,dunlafr01,160.0)
(1885,stoveha01,130.0)
(1886,kellyki01,155.0)
(1887,oneilti01,167.0)
(1888,pinknge01,134.0)
(1889,griffmi01,152.0)
(1889,stoveha01,152.0)
(1890,duffyhu01,161.0)
(1891,brownto01,177.0)
(1892,childcu01,136.0)
(1893,longhe01,149.0)
(1894,hamilbi01,192.0)
(1895,hamilbi01,166.0)
(1896,burkeje01,160.0)
(1897,hamilbi01,152.0)
(1898,mcgrajo01,143.0)
(1899,keelewi01,140.0)
(1899,mcgrajo01,140.0)
(1900,thomaro01,132.0)
(1901,lajoina01,145.0)
(1902,hartsto01,109.0)
(1902,fultzda01,109.0)
(1903,beaumgi01,137.0)
(1904,brownge01,99.0)
(1905,donlimi01,124.0)
(1906,wagneho01,103.0)
(1906,chancfr01,103.0)
(1907,shannsp01,104.0)
(1908,mcintma01,105.0)
(1909,leachto01,126.0)
(1910,mageesh01,110.0)
(1911,cobbty01,147.0)
(1912,collied01,137.0)
(1913,collied01,125.0)
(1914,collied01,122.0)
(1915,cobbty01,144.0)
(1916,cobbty01,113.0)
(1917,bushdo01,112.0)
(1918,grohhe01,86.0)
(1919,ruthba01,103.0)
(1920,ruthba01,158.0)
(1921,ruthba01,177.0)
(1922,hornsro01,141.0)
(1923,ruthba01,151.0)
(1924,ruthba01,143.0)
(1925,cuyleki01,144.0)
(1926,ruthba01,139.0)
(1927,ruthba01,158.0)
(1928,ruthba01,163.0)
(1929,hornsro01,156.0)
(1930,kleinch01,158.0)
(1931,gehrilo01,163.0)
(1932,kleinch01,152.0)
(1933,gehrilo01,138.0)
(1934,gehrich01,134.0)
(1935,galanau01,133.0)
(1936,gehrilo01,167.0)
(1937,dimagjo01,151.0)
(1938,greenha01,144.0)
(1939,rolfere01,139.0)
(1940,willite01,134.0)
(1941,willite01,135.0)
(1942,willite01,141.0)
(1943,vaughar01,112.0)
(1944,stirnsn01,125.0)
(1945,stanked01,128.0)
(1946,willite01,142.0)
(1947,mizejo01,137.0)
(1948,henrito01,138.0)
(1949,willite01,150.0)
(1950,dimagdo01,131.0)
(1951,kinerra01,124.0)
(1951,musiast01,124.0)
(1952,hemusso01,105.0)
(1952,musiast01,105.0)
(1953,snidedu01,132.0)
(1954,mantlmi01,129.0)
(1955,snidedu01,126.0)
(1956,mantlmi01,132.0)
(1957,mantlmi01,121.0)
(1958,mantlmi01,127.0)
(1959,pinsova01,131.0)
(1960,mantlmi01,119.0)
(1961,marisro01,132.0)
(1961,mantlmi01,132.0)
(1962,robinfr02,134.0)
(1963,aaronha01,121.0)
(1964,allendi01,125.0)
(1965,harpeto01,126.0)
(1965,versazo01,126.0)
(1966,robinfr02,122.0)
(1966,aloufe01,122.0)
(1967,aaronha01,113.0)
(1967,brocklo01,113.0)
(1968,beckegl01,98.0)
(1969,jacksre01,123.0)
(1970,willibi01,137.0)
(1971,brocklo01,126.0)
(1972,morgajo02,122.0)
(1973,bondsbo01,131.0)
(1974,rosepe01,110.0)
(1975,rosepe01,112.0)
(1976,rosepe01,130.0)
(1977,carewro01,128.0)
(1978,lefloro01,126.0)
(1979,baylodo01,120.0)
(1980,wilsowi02,133.0)
(1981,henderi01,89.0)
(1982,molitpa01,136.0)
(1983,raineti01,133.0)
(1984,evansdw01,121.0)
(1985,henderi01,146.0)
(1986,henderi01,130.0)
(1987,raineti01,123.0)
(1988,boggswa01,128.0)
(1989,boggswa01,113.0)
(1990,henderi01,119.0)
(1991,molitpa01,133.0)
(1992,phillto02,114.0)
(1993,dykstle01,143.0)
(1994,thomafr04,106.0)
(1995,biggicr01,123.0)
(1996,burksel01,142.0)
(1997,biggicr01,146.0)
(1998,sosasa01,134.0)
(1999,bagweje01,143.0)
(2000,bagweje01,152.0)
(2001,sosasa01,146.0)
(2002,soriaal01,128.0)
(2003,pujolal01,137.0)
(2004,pujolal01,133.0)
(2005,pujolal01,129.0)
(2006,sizemgr01,134.0)
(2007,rodrial01,143.0)
(2008,ramirha01,125.0)
(2009,pujolal01,124.0)
(2010,pujolal01,115.0)
(2011,grandcu01,136.0)