-- Load the MovieLens movies.csv from the local filesystem into the Hive table.
-- Fix: HiveQL requires the TABLE keyword before the table name
-- ("INTO TABLE <name>"), and the statement must end with a semicolon.
load data local inpath "/usr/local/src/practice_code/hive/hive_udf/movies.csv" into table movie_table;
1 2 3 4 5 6 7 8 9
hive> select * from movie_table limit 5; OK movie_table.movieid movie_table.title movie_table.genres 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 2 Jumanji (1995) Adventure|Children|Fantasy 3 Grumpier Old Men (1995) Comedy|Romance 4 Waiting to Exhale (1995) Comedy|Drama|Romance 5 Father of the Bride Part II (1995) Comedy Time taken: 0.452 seconds, Fetched: 5 row(s)
publicclassUppercaseextendsUDF { // 传入一个字符串,将小写变为大写,通过java 实现了hive 的不支持的功能 public Text evaluate(final Text s) { returnnewText(s.toString().toUpperCase()); } }
hive>add jar /usr/local/src/practice_code/hive/hive_udf/hive-1.0-SNAPSHOT-jar-with-dependencies.jar; Added [/usr/local/src/practice_code/hive/hive_udf/hive-1.0-SNAPSHOT-jar-with-dependencies.jar] to class path Added resources: [/usr/local/src/practice_code/hive/hive_udf/hive-1.0-SNAPSHOT-jar-with-dependencies.jar]
5.创建UDF函数
1 2 3
hive> create temporary function upper_func as 'Uppercase'; OK Time taken: 0.039 seconds
hive> select title, upper_func(title) from movie_table limit 5; OK title _c1 Toy Story (1995) TOY STORY (1995) Jumanji (1995) JUMANJI (1995) Grumpier Old Men (1995) GRUMPIER OLD MEN (1995) Waiting to Exhale (1995) WAITING TO EXHALE (1995) Father of the Bride Part II (1995) FATHER OF THE BRIDE PART II (1995) Time taken: 0.092 seconds, Fetched: 5 row(s)
hive>add file /usr/local/src/practice_code/hive/hive_udf/transform1.py; Added resources: [/usr/local/src/practice_code/hive/hive_udf/transform1.py] hive>select transform(movieId, title) using "python transform1.py" as (uuu) from movie_table limit 10; Query ID = root_20210617194446_f35d4eac-6e96-401d-a242-b8b6a918e1fe Total jobs =1 Launching Job 1outof1 Number of reduce tasks issetto0 since there's no reduce operator ... MapReduce Jobs Launched: Stage-Stage-1: Map: 1 Cumulative CPU: 1.87 sec HDFS Read: 261989 HDFS Write: 221 SUCCESS Total MapReduce CPU Time Spent: 1 seconds 870 msec OK pinjie 1-Toy Story (1995) 2-Jumanji (1995) 3-Grumpier Old Men (1995) 4-Waiting to Exhale (1995) 5-Father of the Bride Part II (1995) 6-Heat (1995) 7-Sabrina (1995) 8-Tom and Huck (1995) 9-Sudden Death (1995) 10-GoldenEye (1995) Time taken: 18.456 seconds, Fetched: 10 row(s)
注意:第一个transform是hive的关键字与文件无关系!
可以看到,这种方式启动了MR进行计算,由于只是输出,所以没有reducer。
案例3:使用linux自带awk命令创建udf函数
1.测试awk命令
1 2 3 4 5 6 7 8 9 10 11
(base) [root@main hive_udf]# head movies.csv | awk -F ',' '{print $1"_"$2}' movieId_title 1_Toy Story (1995) 2_Jumanji (1995) 3_Grumpier Old Men (1995) 4_Waiting to Exhale (1995) 5_Father of the Bride Part II (1995) 6_Heat (1995) 7_Sabrina (1995) 8_Tom and Huck (1995) 9_Sudden Death (1995)
hive> show create table badou.docs; OK createtab_stmt CREATE TABLE `badou.docs`( `line` string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 'hdfs://main:9000/user/hive/warehouse/badou.db/docs' TBLPROPERTIES ( 'COLUMN_STATS_ACCURATE'='true', 'numFiles'='1', 'numRows'='0', 'rawDataSize'='0', 'totalSize'='632207', 'transient_lastDdlTime'='1618880802') Time taken: 0.163 seconds, Fetched: 17 row(s)
-- 表只有一个字段line,一行一篇文章。 hive>select line from badou.docs; Soames gave him a strange, sidelong stare. “My wife can see noone,” he muttered doggedly. Young Jolyon answered gently: “I shouldn’t keep her a minute.” Soames brushed by him and barred the way. “She can see noone,” he said again. Young Jolyon’s glance shot past him into the hall, and Soames turned. There in the drawing-room doorway stood Irene, her eyes were wild and eager, her lips were parted, her hands outstretched. In the sight ofboth men that light vanished from her face; her hands dropped to her sides; she stood like stone. Soames spun round, and met his visitor’s eyes, andat the look he saw in them, a sound like a snarl escaped him. He drew his lips back in the ghost of a smile. “This is my house,” he said; “I manage my own affairs. I’ve told you once — I tell you again; we arenotat home.” Andin young Jolyon’s face he slammed the door.
The End Time taken: 0.089 seconds, Fetched: 2866row(s)
def mapper_func():
    """Map step of the streaming word-count job.

    Hive's TRANSFORM feeds each selected row to this script's standard
    input; for every recognized word we emit one "word\t1" line on stdout.
    """
    # Regex that pulls the alphanumeric core out of each raw token.
    pattern = re.compile(r'\w+')
    for line in sys.stdin:
        # Split on any run of whitespace. Fix: the original split only on a
        # single space character, so tab- or multi-space-separated tokens
        # were merged and all but their first word was silently dropped.
        for word in line.strip().split():
            matches = pattern.findall(word)
            # Skip tokens containing no word characters (pure punctuation).
            if len(matches) < 1:
                continue
            # Keep only the first match, normalized to lower case.
            token = matches[0].lower().strip()
            if token != '':
                print("%s\t1" % token)
def reducer():
    """Reduce step: sum per-word counts from the sorted mapper output.

    Expects stdin lines of the form "word\tcount", already grouped by word
    (Hive's CLUSTER BY guarantees all lines for one word arrive together).
    Emits one "word\ttotal" line per distinct word.
    """
    total = 0            # running count for the word currently being summed
    current_word = None  # word being accumulated; None until first valid line
    for line in sys.stdin:
        fields = line.strip().split("\t")
        # Ignore malformed lines that are not (word, count) pairs.
        if len(fields) != 2:
            continue
        word, count = fields
        if word == ' ' or word == '':
            continue
        if current_word is None:
            current_word = word
        if current_word != word:
            # Word boundary reached: flush the finished word's total.
            print('\t'.join([current_word, str(total)]))
            current_word = word
            total = 0
        total += int(count)
    # The loop never flushes the last word, so emit it here.
    # Fix: guard against empty input, where current_word is still None and
    # the original unconditional join() raised a TypeError.
    if current_word is not None:
        print('\t'.join([current_word, str(total)]))


if __name__ == "__main__":
    reducer()
hive> select transform(wc.word, wc.count) using "python reduce.py" as w, c > from > ( > select transform(line) using "python map.py" as word, count > from badou.docs > cluster by word)wc -- cluster by按照word进行分桶,方便后续进行reduce > limit 1000; Query ID = root_20210618184656_d016ca78-ece6-42b8-ad95-82e34c90fda6 Total jobs = 1 Launching Job 1 out of 1 Number of reduce tasks not specified. Estimated from input data size: 1 In order to change the average load for a reducer (in bytes): broadstairs 3 brocaded 1 broke 19 broken 20 brokenly 1 brokers 1 brompton 1 bronze 2 Time taken: 29.892 seconds, Fetched: 1000 row(s)
@Override public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException { if (args.length != 1) { thrownewUDFArgumentLengthException("ExplodeMap takes only one argument"); } if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) { thrownewUDFArgumentException("ExplodeMap takes string as a parameter"); }
hive> add jar /usr/local/src/practice_code/hive/hive_udf/hive-1.0-SNAPSHOT-jar-with-dependencies.jar; Added [/usr/local/src/practice_code/hive/hive_udf/hive-1.0-SNAPSHOT-jar-with-dependencies.jar] to class path Added resources: [/usr/local/src/practice_code/hive/hive_udf/hive-1.0-SNAPSHOT-jar-with-dependencies.jar]
hive> create temporary function explode_func as 'Expolde'; OK Time taken: 0.006 seconds
4.验证
1 2 3 4 5 6 7 8 9 10 11 12 13 14
hive> select explode_func(data) from badou.udtf_test_table; OK col1 col2 10.1 20.2 30.3 40.4 50.5 100.11 200.22 300.33 400.44 500.55 Time taken: 0.057 seconds, Fetched: 10 row(s)