...
Code Block | ||||||
---|---|---|---|---|---|---|
| ||||||
-- load up the base User Defined Function library (piggybank) and get a handle on the REPLACE function register /user/hue/shared/pig/udfs/piggybank.jar; define REPLACE org.apache.pig.piggybank.evaluation.string.REPLACE(); -- load the salary file and declare its structure inputFile = LOAD '/user/hue/vzSeminar/openGeorgia/salaryTravelReport-SMALL.csv' using org.apache.pig.piggybank.storage.CSVExcelStorage() as (name:chararray, title:chararray, salary:chararray, travel:chararray, orgType:chararray, org:chararray, year:int); -- loop thru the input data to clean up the number fields a bit cleanedUpNumbers = foreach inputFile GENERATE name as name, title as title, (float)REPLACE(salary, ',','') as salary, -- take out the commas and cast to a float (float)REPLACE(travel, ',','') as travel, -- take out the commas and cast to a float orgType as orgType, org as org, year as year; -- trim down to just Local Boards of Education onlySchoolBoards = filter cleanedUpNumbers by orgType == 'LBOE'; -- further trim it down to just be for the year in question onlySchoolBoardsFor2010 = filter onlySchoolBoards by year == 2010; -- bucket them up by the job title byTitle = GROUP onlySchoolBoardsFor2010 BY title; -- loop through the titles and for each one... salaryBreakdown = FOREACH byTitle GENERATE group as title, -- we grouped on this above COUNT(onlySchoolBoardsFor2010), -- how many people with this title MIN(onlySchoolBoardsFor2010.salary), -- determine the min MAX(onlySchoolBoardsFor2010.salary), -- determine the max AVG(onlySchoolBoardsFor2010.salary); -- determine the avg -- guarantee the order on the way out sortedSalaryBreakdown = ORDER salaryBreakdown by title; dump sortedSalaryBreakdown; -- save results back to HDFS --STORE sortedSalaryBreakdown into '/user/hue/vzSeminar/openGeorgia/titleBreakdownForSchoolsIn2010.txt'; |
Sample output. UPDATE IT WITH FULL DATA VALUES!!
Code Block | ||
---|---|---|
| ||
(ADAPTED PHYS ED TEACHER,35,19384.24,96320.194,24349.45,64689.09,43712.0302734375) (ADULT EDUCATION DIRECTOR/COORD,193,1033.60,99918.006,74164.01,37078.07238769531) (ADULT EDUCATION TEACHER,639,101203002.790,9987.29) (AFTER-SCHOOL PROGRAM WORKER,2,624.04,78493.98) (ALTERNATIVE SCHOOL DIRECTOR,2,111199.80,127149.1252424.72,16903.59646267361) (ASSISTANT PRINCIPAL,43638,1000086243.96,99928.92) (ATHLETICS DIRECTOR,1,12278925,100989.04,12278977926.046794819079) (ATTENDANCE WORKER,121,103007742.0409,9573.33) (AUDIOLOGIST,9,102240.46,81184.247742.09,7742.08984375) (AUDITOR,1,1183317.92,5021483317.0892,8381183317.88921875) (BOOKKEEPER,11811,10650.44,93.3833484.04,26734.2587890625) (BUS DRIVER,49,1321767.68,1012550915.3481,993520278.17801754075655) (BUSINESS SERV SECRETARY/CLERK,26,224789.99,100072080.000,98926443.99907658503605) (CENTRAL SUPPORT CLERK,41,23392.33,23392.33,7478523392.12330078125) (CONSTRUCTION MANAGER,1,833890.67,10102533890.9967,8089933890.57671875) (CROSSING GUARD,13,100528.09,10745384.3909,9403670.854876755934497) (CROSSROADS,17,30390.72,77269.82) (CROSSROADS ALT SCHOOL TEACHER,16,37425.80,71386.92) (CUSTODIAL PERSONNEL,1393,10099.11,9915.36CUSTODIAL PERSONNEL,132,3803.52,46626.88,26624.915542140152) (DEPUTY/ASSOC/ASSISTANT SUPT,2,108993.76,161051.34,135022.55078125) (DIAGNOSTICIAN,4,59711.15,69417.72,65523.5888671875) (DIRECTOR OF CURRICULUM/INSTR,6,31162.52,188324.23,98002.13346354167) |