Wednesday 4 March 2020

IBM Big Data


-- Hive word count: load raw text into a one-column table, split each line
-- into words, explode to one row per word, then count occurrences per word.
CREATE TABLE feedback_vnr(comments STRING);

-- Copy the raw text file from the local filesystem into the table's storage.
LOAD DATA LOCAL INPATH '/home/cloudera/Desktop/file.txt' INTO TABLE feedback_vnr;

-- Exploratory: inspect the raw lines as loaded.
SELECT * FROM feedback_vnr;

-- Each line becomes an array of space-separated tokens.
SELECT split(comments, ' ') FROM feedback_vnr;

-- explode() turns each array into one row per word.
-- (fix: the original statement was missing its terminating semicolon,
-- which makes the Hive CLI merge it with the following statement.)
SELECT explode(split(comments, ' ')) FROM feedback_vnr;

-- Final word count: explode inside a derived table (Hive does not allow
-- explode() mixed with other columns directly), then group by word.
SELECT word, count(*)
FROM (SELECT explode(split(comments, ' ')) AS word FROM feedback_vnr) tmp
GROUP BY word;

___________________________

-- Pig word count: tokenize every input line, keep only word-like tokens,
-- then emit (count, word) pairs. Reads /user/cloudera/55, writes /user/cloudera/n66.
raw_lines = LOAD '/user/cloudera/55';

-- TOKENIZE splits the line ($0, cast to chararray) into a bag of tokens;
-- FLATTEN turns that bag into one record per token.
tokens = FOREACH raw_lines GENERATE FLATTEN(TOKENIZE((chararray)$0)) AS word;

-- Discard punctuation/empty tokens: keep only strings of word characters.
word_tokens = FILTER tokens BY word MATCHES '\\w+';

-- One group per distinct word.
grouped_words = GROUP word_tokens BY word;

-- Emit (occurrence count, word) — same column order as the original script.
word_counts = FOREACH grouped_words GENERATE COUNT(word_tokens), group;

STORE word_counts INTO '/user/cloudera/n66';
____________________________



Finding the most frequently occurring first character among the words of a file


-- Find the single most frequently occurring first character among all words
-- in the input file. Reads /user/cloudera/my-friends, writes one (letter, count)
-- row to /user/cloudera/dummy5556777777.
raw_lines = LOAD '/user/cloudera/my-friends' AS (line: chararray);

-- Split each line into individual word tokens.
word_tokens = FOREACH raw_lines GENERATE FLATTEN(TOKENIZE(line)) AS token:chararray;

-- Keep just the first character of each word.
first_chars = FOREACH word_tokens GENERATE SUBSTRING(token, 0, 1) AS letter:chararray;

-- Count occurrences per first character.
by_letter = GROUP first_chars BY letter;
letter_counts = FOREACH by_letter GENERATE group, COUNT(first_chars);

-- Sort by the count column (position $1) descending and keep only the top row.
ranked = ORDER letter_counts BY $1 DESC;
top_letter = LIMIT ranked 1;

STORE top_letter INTO '/user/cloudera/dummy5556777777';

____________________________________



# Flume agent "tier1": netcat source -> in-memory channel -> HDFS sink.
# Name the components owned by this agent.
tier1.sources  = source1
tier1.channels = channel1
tier1.sinks    = sink1 

# Source: listen on a local TCP port; each newline-terminated line received
# becomes one Flume event.
tier1.sources.source1.type     = netcat
tier1.sources.source1.bind     = 127.0.0.1
tier1.sources.source1.port     = 44444
tier1.sources.source1.channels = channel1

# Channel: buffer up to 100 events in memory (events are lost on agent restart).
tier1.channels.channel1.type   = memory
tier1.channels.channel1.capacity = 100

# Sink: write events to HDFS as plain text.
# fix: the sink type alias is lowercase "hdfs" per the Flume user guide,
# and HDFS-sink options are namespaced under "hdfs." — the original
# "fileType=DataStream" was silently ignored, leaving the default
# SequenceFile output format.
tier1.sinks.sink1.type= hdfs
tier1.sinks.sink1.hdfs.fileType=DataStream
tier1.sinks.sink1.channel      = channel1

tier1.sinks.sink1.hdfs.path = hdfs://localhost:8020/user/cloudera/flume/events_manish_rvim

No comments:

Post a Comment