Pig Script for Comparison of 2 Files
Goals
- Find out whether both the files/folders have same number of lines
- Find out whether both the files/folders have same number of tuples when grouped by all dimensions
- Find out all the tuples in a Group of one File should be having a one to one match(Measures will match with a tolerance) in another file/folder
Script
#!/bin/bash
FIRST_FILE="/input/output-mapred.txt";
SECOND_FILE="/input/output-spork.txt";
OUTPUT_FILE="/input/output-diff.txt";
DIMENSION_STRING="1,2,4"
MEASURE_STRING="3";
UNIVERSAL_TOLERANCE="2";
PIG_STORAGE_DELIMETER="\t";
DIFF_PIG_SCRIPT_LOCATION="/tmp/diff-f1.pig";
DIFF_PIG_SCRIPT_LOCATION_REVERSE="/tmp/diff-f2";
function usage () {
me=$(basename $BASH_SOURCE)
echo
echo "$me script helps to get diff of 2 files in HDFS with given tolerance."
echo
echo "Usage : "
echo "./$me [Options]"
echo " Options : "
echo -e "\t -h --help"
echo -e "\t\t - Print this"
echo -e "\t -file1=[ABSOLUTE_FILE_PATH]"
echo -e "\t\t - File/Folder to compare with"
echo -e "\t -file2=[ABSOLUTE_FILE_PATH]"
echo -e "\t\t - File/Folder to compare to"
echo -e "\t -outputfile=[ABSOLUTE_FILE_PATH]"
echo -e "\t\t - File/Folder to save difference to"
echo -e "\t -dimpos=[COMMA_SEPERATED_DIMENSIONS_POSITIONS]"
echo -e "\t\t - Positions of Dimensions in files"
echo -e "\t -meapos=[COMMA_SEPERATED_MEASURES_POSITIONS]"
echo -e "\t\t - Positions of Measures in files"
echo -e "\t -tollimit=[TOLERANCE_LIMIT]"
echo -e "\t\t - Tolerance limit when comparing measures"
echo -e "\t -delimeter=[DELIMETER]"
echo -e "\t\t - Delimeter to be used in PigStorage. Default is tab"
}
if [ -z "$1" ]
then
usage
exit
fi
while [ "$1" != "" ]; do
PARAM=`echo $1 | awk -F= '{print $1}'`
VALUE=`echo $1 | awk -F= '{print $2}'`
case $PARAM in
-h | --help)
usage
exit
;;
-tollimit)
UNIVERSAL_TOLERANCE=$VALUE
;;
-delimeter)
PIG_STORAGE_DELIMETER=$VALUE
;;
-meapos)
MEASURE_STRING=$VALUE
;;
-dimpos)
DIMENSION_STRING=$VALUE
;;
-file2)
SECOND_FILE=$VALUE
;;
-file1)
FIRST_FILE=$VALUE
;;
-outputfile)
OUTPUT_FILE=$VALUE
;;
*)
echo "ERROR: unknown parameter \"$PARAM\""
usage
exit 1
;;
esac
shift
done
#Convert Strings to Arrays
IFS=', ' read -r -a DIMENSIONS_ARRAY <<< "$DIMENSION_STRING"
IFS=', ' read -r -a MEASURES_ARRAY <<< "$MEASURE_STRING"
containsElement () {
local e
for e in "${@:2}"; do [[ "$e" == "$1" ]] && return 0; done
return 1
}
#Check if any column is in both Dimensions and Measures
for element in "${MEASURES_ARRAY[@]}"
do
containsElement "$element" ${DIMENSIONS_ARRAY[@]}
if [ $? = 0 ] ; then
echo "I am confused about column $element .. its both in Dimensions and Measures .. "
echo "Can't go forward ... exiting ... "
exit 1;
fi
done
MEASURES_SIZE=${#MEASURES_ARRAY[@]};
DIMENSIONS_SIZE=${#DIMENSIONS_ARRAY[@]};
COLUMNS_SIZE=$MEASURES_SIZE+$DIMENSIONS_SIZE;
SCHEMA_STRING="";
ALL_DIMENSIONS="";
ALL_MEASURES="";
for element in "${DIMENSIONS_ARRAY[@]}"
do
SCHEMA_STRING="$SCHEMA_STRING , field$element:chararray";
ALL_DIMENSIONS="$ALL_DIMENSIONS , field$element";
done
for element in "${MEASURES_ARRAY[@]}"
do
SCHEMA_STRING="$SCHEMA_STRING , field$element:float";
ALL_MEASURES="$ALL_MEASURES , field$element";
done
SCHEMA_STRING=`echo "$SCHEMA_STRING" | sed 's/ , //'`;
ALL_DIMENSIONS=`echo "$ALL_DIMENSIONS" | sed 's/ , //'`;
ALL_MEASURES=`echo "$ALL_MEASURES" | sed 's/ , //'`;
#Starting to create pig script
SCRIPT_CONTENT="";
echo "f1 = LOAD '$FIRST_FILE' USING PigStorage('$PIG_STORAGE_DELIMETER') as ($SCHEMA_STRING);" >$DIFF_PIG_SCRIPT_LOCATION
echo "f2 = LOAD '$SECOND_FILE' USING PigStorage('$PIG_STORAGE_DELIMETER') as ($SCHEMA_STRING);" >>$DIFF_PIG_SCRIPT_LOCATION
echo "g_f1 = GROUP f1 BY ($ALL_DIMENSIONS);" >>$DIFF_PIG_SCRIPT_LOCATION
echo "g_f2 = GROUP f2 BY ($ALL_DIMENSIONS);" >>$DIFF_PIG_SCRIPT_LOCATION
echo "g_f1_flatten = FOREACH g_f1 GENERATE FLATTEN (group) as ($ALL_DIMENSIONS)" >>$DIFF_PIG_SCRIPT_LOCATION
TMP="";
for element in "${MEASURES_ARRAY[@]}"
do
TMP="$TMP , SUM(f1.field$element) as field$element"
done
echo "$TMP;" >>$DIFF_PIG_SCRIPT_LOCATION
echo "g_f2_flatten = FOREACH g_f2 GENERATE FLATTEN (group) as ($ALL_DIMENSIONS)" >>$DIFF_PIG_SCRIPT_LOCATION
TMP="";
for element in "${MEASURES_ARRAY[@]}"
do
TMP="$TMP , SUM(f2.field$element) as field$element"
done
echo "$TMP;" >>$DIFF_PIG_SCRIPT_LOCATION
echo "joined = JOIN g_f1_flatten by ($ALL_DIMENSIONS),g_f2_flatten by ($ALL_DIMENSIONS);" >>$DIFF_PIG_SCRIPT_LOCATION
echo "list = FOREACH joined " >>$DIFF_PIG_SCRIPT_LOCATION
echo "GENERATE " >>$DIFF_PIG_SCRIPT_LOCATION
for element in "${MEASURES_ARRAY[@]}"
do
echo "g_f1_flatten::field$element as f1field$element, g_f2_flatten::field$element as f2field$element," >>$DIFF_PIG_SCRIPT_LOCATION
done
count=${#DIMENSIONS_ARRAY[@]};
for element in "${DIMENSIONS_ARRAY[@]}"
do
echo "g_f1_flatten::field$element as f1field$element, g_f2_flatten::field$element as f2field$element" >>$DIFF_PIG_SCRIPT_LOCATION
if [ "$count" -ge 2 ];then
echo "," >>$DIFF_PIG_SCRIPT_LOCATION
fi
count=`expr $count - 1`;
done
echo ";" >>$DIFF_PIG_SCRIPT_LOCATION
echo "filtered_list = FILTER list by " >>$DIFF_PIG_SCRIPT_LOCATION
TMP="";
for element in "${MEASURES_ARRAY[@]}"
do
TMP="$TMP AND ((f1field$element-f2field$element>$UNIVERSAL_TOLERANCE) OR (f1field$element-f2field$element<-$UNIVERSAL_TOLERANCE))";
done
TMP=`echo $TMP | sed 's/AND//'`;
echo $TMP >>$DIFF_PIG_SCRIPT_LOCATION
echo ";" >>$DIFF_PIG_SCRIPT_LOCATION
echo "store filtered_list INTO '$OUTPUT_FILE';" >>$DIFF_PIG_SCRIPT_LOCATION
cat $DIFF_PIG_SCRIPT_LOCATION
- chmod +x generateDiffPigScripts.sh
Input Files
- hadoop fs -cat /input/output-mapred.txt
yagna dande 28.5 company
yagna dande 26.5
companyyagna dande 29.5
companyhemanth dande 33 planetsoft
hema ch 28 satyam
giri ch 32 narayana
diff 1 2 - Tolerance is 2.0
yagna,dande,company,26.5
- hadoop fs -cat /input/output-spork.txt
giri ch 32 narayana
hema ch 28 satyam
hemanth dande 33 planetsoft
yagna dande 29
companyyagna dande 39
companyyagna1 dande 29
companydiff 2 1 - Tolerance is 2.0
yagna,dande,company,39.0
yagna1,dande,company,29.0
Proof for Columns order will be intact when run with MapReduce and Spark
- wget http://hortonassets.s3.amazonaws.com/pig/lahman591-csv.zip
- unzip it
- upload M
aster.csv
and Batting.csv
- Execute pig Script
batting = load '/input/Batting.csv' using PigStorage(',');
raw_runs = FILTER batting BY $1>0;
runs = FOREACH raw_runs GENERATE $0 as playerID, $1 as year, $8 as runs;
grp_data = GROUP runs by (year);
max_runs = FOREACH grp_data GENERATE group as grp,MAX(runs.runs) as max_runs;
join_max_run = JOIN max_runs by ($0, max_runs), runs by (year,runs);
join_data = FOREACH join_max_run GENERATE $0 as year, $2 as playerID, $1 as runs;
DUMP join_data;
Spark
(1966,aloufe01,122.0)
(1966,robinfr02,122.0)
(1876,barnero01,126.0)
(1957,mantlmi01,121.0)
(1872,eggleda01,94.0)
(1894,hamilbi01,192.0)
(1904,brownge01,99.0)
(1906,chancfr01,103.0)
(1906,wagneho01,103.0)
(1955,snidedu01,126.0)
(1913,collied01,125.0)
(1940,willite01,134.0)
(1902,fultzda01,109.0)
(1902,hartsto01,109.0)
(1889,griffmi01,152.0)
(1889,stoveha01,152.0)
(1937,dimagjo01,151.0)
(1926,ruthba01,139.0)
(1878,highadi01,60.0)
(1980,wilsowi02,133.0)
(1951,kinerra01,124.0)
(1951,musiast01,124.0)
(1997,biggicr01,146.0)
(1968,beckegl01,98.0)
(1939,rolfere01,139.0)
(1931,gehrilo01,163.0)
(1908,mcintma01,105.0)
(2006,sizemgr01,134.0)
(1944,stirnsn01,125.0)
(1977,carewro01,128.0)
(1911,cobbty01,147.0)
(1962,robinfr02,134.0)
(1915,cobbty01,144.0)
(1960,mantlmi01,119.0)
(1896,burkeje01,160.0)
(1933,gehrilo01,138.0)
(1883,stoveha01,110.0)
(1942,willite01,141.0)
(1900,thomaro01,132.0)
(1890,duffyhu01,161.0)
(1953,snidedu01,132.0)
(1986,henderi01,130.0)
(1975,rosepe01,112.0)
(1898,mcgrajo01,143.0)
(1964,allendi01,125.0)
(2000,bagweje01,152.0)
(1881,gorege01,86.0)
(1988,boggswa01,128.0)
(1922,hornsro01,141.0)
(1971,brocklo01,126.0)
(2002,soriaal01,128.0)
(1979,baylodo01,120.0)
(1874,mcveyca01,91.0)
(1999,bagweje01,143.0)
(1892,childcu01,136.0)
(2008,ramirha01,125.0)
(2004,pujolal01,133.0)
(1982,molitpa01,136.0)
(1973,bondsbo01,131.0)
(1887,oneilti01,167.0)
(1885,stoveha01,130.0)
(1924,ruthba01,143.0)
(1917,bushdo01,112.0)
(1991,molitpa01,133.0)
(1935,galanau01,133.0)
(1920,ruthba01,158.0)
(1959,pinsova01,131.0)
(1919,ruthba01,103.0)
(1928,ruthba01,163.0)
(1995,biggicr01,123.0)
(2011,grandcu01,136.0)
(1948,henrito01,138.0)
(1993,dykstle01,143.0)
(1984,evansdw01,121.0)
(1946,willite01,142.0)
(1921,ruthba01,177.0)
(1893,longhe01,149.0)
(1990,henderi01,119.0)
(1934,gehrich01,134.0)
(2010,pujolal01,115.0)
(1918,grohhe01,86.0)
(1901,lajoina01,145.0)
(1996,burksel01,142.0)
(1963,aaronha01,121.0)
(1923,ruthba01,151.0)
(1932,kleinch01,152.0)
(1916,cobbty01,113.0)
(1877,orourji01,68.0)
(1958,mantlmi01,127.0)
(1978,lefloro01,126.0)
(2001,sosasa01,146.0)
(1936,gehrilo01,167.0)
(1974,rosepe01,110.0)
(1965,harpeto01,126.0)
(1965,versazo01,126.0)
(1947,mizejo01,137.0)
(1873,barnero01,125.0)
(1956,mantlmi01,132.0)
(1985,henderi01,146.0)
(1888,pinknge01,134.0)
(1994,thomafr04,106.0)
(2003,pujolal01,137.0)
(1884,dunlafr01,160.0)
(1970,willibi01,137.0)
(1967,aaronha01,113.0)
(1967,brocklo01,113.0)
(1871,barnero01,66.0)
(1910,mageesh01,110.0)
(1925,cuyleki01,144.0)
(2007,rodrial01,143.0)
(1938,greenha01,144.0)
(1981,henderi01,89.0)
(1929,hornsro01,156.0)
(1927,ruthba01,158.0)
(1905,donlimi01,124.0)
(1912,collied01,137.0)
(1914,collied01,122.0)
(1943,vaughar01,112.0)
(1983,raineti01,133.0)
(1972,morgajo02,122.0)
(1976,rosepe01,130.0)
(1969,jacksre01,123.0)
(1895,hamilbi01,166.0)
(1941,willite01,135.0)
(1945,stanked01,128.0)
(1998,sosasa01,134.0)
(1954,mantlmi01,129.0)
(1897,hamilbi01,152.0)
(1907,shannsp01,104.0)
(2005,pujolal01,129.0)
(1879,jonesch01,85.0)
(1903,beaumgi01,137.0)
(1930,kleinch01,158.0)
(1909,leachto01,126.0)
(2009,pujolal01,124.0)
(1952,hemusso01,105.0)
(1952,musiast01,105.0)
(1875,barnero01,115.0)
(1880,dalryab01,91.0)
(1899,keelewi01,140.0)
(1899,mcgrajo01,140.0)
(1949,willite01,150.0)
(1989,boggswa01,113.0)
(1961,mantlmi01,132.0)
(1961,marisro01,132.0)
(1891,brownto01,177.0)
(1987,raineti01,123.0)
(1992,phillto02,114.0)
(1886,kellyki01,155.0)
(1882,gorege01,99.0)
(1950,dimagdo01,131.0)
Map Reduce
(1871,barnero01,66.0)
(1872,eggleda01,94.0)
(1873,barnero01,125.0)
(1874,mcveyca01,91.0)
(1875,barnero01,115.0)
(1876,barnero01,126.0)
(1877,orourji01,68.0)
(1878,highadi01,60.0)
(1879,jonesch01,85.0)
(1880,dalryab01,91.0)
(1881,gorege01,86.0)
(1882,gorege01,99.0)
(1883,stoveha01,110.0)
(1884,dunlafr01,160.0)
(1885,stoveha01,130.0)
(1886,kellyki01,155.0)
(1887,oneilti01,167.0)
(1888,pinknge01,134.0)
(1889,griffmi01,152.0)
(1889,stoveha01,152.0)
(1890,duffyhu01,161.0)
(1891,brownto01,177.0)
(1892,childcu01,136.0)
(1893,longhe01,149.0)
(1894,hamilbi01,192.0)
(1895,hamilbi01,166.0)
(1896,burkeje01,160.0)
(1897,hamilbi01,152.0)
(1898,mcgrajo01,143.0)
(1899,keelewi01,140.0)
(1899,mcgrajo01,140.0)
(1900,thomaro01,132.0)
(1901,lajoina01,145.0)
(1902,hartsto01,109.0)
(1902,fultzda01,109.0)
(1903,beaumgi01,137.0)
(1904,brownge01,99.0)
(1905,donlimi01,124.0)
(1906,wagneho01,103.0)
(1906,chancfr01,103.0)
(1907,shannsp01,104.0)
(1908,mcintma01,105.0)
(1909,leachto01,126.0)
(1910,mageesh01,110.0)
(1911,cobbty01,147.0)
(1912,collied01,137.0)
(1913,collied01,125.0)
(1914,collied01,122.0)
(1915,cobbty01,144.0)
(1916,cobbty01,113.0)
(1917,bushdo01,112.0)
(1918,grohhe01,86.0)
(1919,ruthba01,103.0)
(1920,ruthba01,158.0)
(1921,ruthba01,177.0)
(1922,hornsro01,141.0)
(1923,ruthba01,151.0)
(1924,ruthba01,143.0)
(1925,cuyleki01,144.0)
(1926,ruthba01,139.0)
(1927,ruthba01,158.0)
(1928,ruthba01,163.0)
(1929,hornsro01,156.0)
(1930,kleinch01,158.0)
(1931,gehrilo01,163.0)
(1932,kleinch01,152.0)
(1933,gehrilo01,138.0)
(1934,gehrich01,134.0)
(1935,galanau01,133.0)
(1936,gehrilo01,167.0)
(1937,dimagjo01,151.0)
(1938,greenha01,144.0)
(1939,rolfere01,139.0)
(1940,willite01,134.0)
(1941,willite01,135.0)
(1942,willite01,141.0)
(1943,vaughar01,112.0)
(1944,stirnsn01,125.0)
(1945,stanked01,128.0)
(1946,willite01,142.0)
(1947,mizejo01,137.0)
(1948,henrito01,138.0)
(1949,willite01,150.0)
(1950,dimagdo01,131.0)
(1951,kinerra01,124.0)
(1951,musiast01,124.0)
(1952,hemusso01,105.0)
(1952,musiast01,105.0)
(1953,snidedu01,132.0)
(1954,mantlmi01,129.0)
(1955,snidedu01,126.0)
(1956,mantlmi01,132.0)
(1957,mantlmi01,121.0)
(1958,mantlmi01,127.0)
(1959,pinsova01,131.0)
(1960,mantlmi01,119.0)
(1961,marisro01,132.0)
(1961,mantlmi01,132.0)
(1962,robinfr02,134.0)
(1963,aaronha01,121.0)
(1964,allendi01,125.0)
(1965,harpeto01,126.0)
(1965,versazo01,126.0)
(1966,robinfr02,122.0)
(1966,aloufe01,122.0)
(1967,aaronha01,113.0)
(1967,brocklo01,113.0)
(1968,beckegl01,98.0)
(1969,jacksre01,123.0)
(1970,willibi01,137.0)
(1971,brocklo01,126.0)
(1972,morgajo02,122.0)
(1973,bondsbo01,131.0)
(1974,rosepe01,110.0)
(1975,rosepe01,112.0)
(1976,rosepe01,130.0)
(1977,carewro01,128.0)
(1978,lefloro01,126.0)
(1979,baylodo01,120.0)
(1980,wilsowi02,133.0)
(1981,henderi01,89.0)
(1982,molitpa01,136.0)
(1983,raineti01,133.0)
(1984,evansdw01,121.0)
(1985,henderi01,146.0)
(1986,henderi01,130.0)
(1987,raineti01,123.0)
(1988,boggswa01,128.0)
(1989,boggswa01,113.0)
(1990,henderi01,119.0)
(1991,molitpa01,133.0)
(1992,phillto02,114.0)
(1993,dykstle01,143.0)
(1994,thomafr04,106.0)
(1995,biggicr01,123.0)
(1996,burksel01,142.0)
(1997,biggicr01,146.0)
(1998,sosasa01,134.0)
(1999,bagweje01,143.0)
(2000,bagweje01,152.0)
(2001,sosasa01,146.0)
(2002,soriaal01,128.0)
(2003,pujolal01,137.0)
(2004,pujolal01,133.0)
(2005,pujolal01,129.0)
(2006,sizemgr01,134.0)
(2007,rodrial01,143.0)
(2008,ramirha01,125.0)
(2009,pujolal01,124.0)
(2010,pujolal01,115.0)
(2011,grandcu01,136.0)