I need help in parsing the JSON file using HIVE. This file has nested arrays, when i tried to parse the file or query using the HiVE UDF i can drill down just to one level. Then next level arrays were coming up Null in my result. I have given the example below. File has couple of sections (array ), but below given one is most complex one. I tried to use get_json_object to parse, i was able to get data just one level, it didn't pull the nested arrays at all. It would be helpful if someone guide me in parsing the nested json arrays
"section": {
"moodCode": "xxx",
"classCode": "xxx",
"templateId": {
"root": "2.xx.840"
},
"code": {
"codeSystemName": "LOINC",
},
"title": "problems",
"text": {
"mediaType": "text/x-hl7-text+xml",
"list": [{
"caption": "Recorded",
"item": {
"ID": "pr101",
"content": [{
"ID": "pr101-desc",
"text": "Salm"
},
"003.1"],
"text": " "
},
"text": " "
},
{
"caption": "Reported",
"item": "None Reported",
"text": " "
}],
"text": " "
},
"entry": {
"typeCode": "DRIV",
"act": {
"moodCode": "EVN",
"classCode": "ACT",
"templateId": [{
"root": "2.16.840.1.0.1.27"
},
{
"root": "1.3.6.1.4..1"
},
{
"root": "1.3.6.1.4.4.5.2"
}],
"id": {
"root": "068fd4d4-dfa2-48190768f"
},
"code": {
"nullFlavor": "NA"
},
"statusCode": {
"code": "completed"
},
"effectiveTime": {
"low": {
"value": "20140428144743+0100"
},
"high": {
"value": "20140428144743+0100"
},
"text": " "
},
"entryRelationship": {
"typeCode": "SUBJ",
"observation": {
"moodCode": "EVN",
"classCode": "OBS",
"templateId": [{
"root": "2.16.840.1.20.1.28"
},
{
"root": "1.3.6.1.4.1.5.3.1.4.5"
}],
"id": {
"root": "fa34b4da4dbb-b090-01bd4d6ef62b"
},
"code": {
"codeSystemName": "SNOMED CT",
"code": "282009",
"displayName": "diagnosis",
"codeSystem": "2.16.840.1.6.96"
},
"text": {
"reference": {
"value": "#pr101"
},
"text": " "
},
"statusCode": {
"code": "completed"
},
"effectiveTime": {
"low": {
"value": "20140428144743+0100"
},
"high": {
"nullFlavor": "UNK"
},
"text": " "
},
"value": {
"codeSystemName": "ICD-9",
"xsi:type": "CD",
"code": "003.1",
"displayName": "Sla sia",
"codeSystem": "2.16.840.1.103",
"originalText": {
"reference": {
"value": "#pr101-desc"
},
"text": " "
},
"text": " "
},
"entryRelationship": {
"typeCode": "REFR",
"observation": {
"moodCode": "EVN",
"classCode": "OBS",
"templateId": [{
"root": "2.16.840..1.50"
},
{
"root": "2.16.8410.20.1.57"
},
{
"root": "1.3.6.13.1.4.1.1"
}],
"code": {
"codeSystemName": "LOINC",
"code": "33999-4",
"displayName": "Status",
"codeSystem": "2.16.840.1.113883.6.1"
},
"statusCode": {
"code": "completed"
},
"value": {
"codeSystemName": " CT",
"xsi:type": "CE",
"code": "55563",
"displayName": "active",
"codeSystem": "2.16.8406.96"
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
"text": " "
},
When i used get_json_object below mentioned data came up null
.
"title": "problems",
"text": {
"mediaType": "text/x-hl7-text+xml",
"list": [{
"caption": "Recorded",
"item": {
"ID": "pr101",
"content": [{
"ID": "pr101-desc",
"text": "Salm"
},
"003.1"],
updated-you have to format your JSON file in such a way that each record should be in just one line, like
'{"section":{"moodCode":"xxx","classCode":"xxx","templateId":{"root":"2.xx.840"},"code":{"codeSystemName":"LOINC"},"title":"problems","text":{"mediaType":"text/x-hl7-text+xml","list":[{"caption":"Recorded","item":{"ID":"pr101","content":[{"ID":"pr101-desc","text":"Salm"},"003.1"],"text":" "},"text":" "},{"caption":"Reported","item":"None Reported","text":" "}],"text":" "},"entry":{"typeCode":"DRIV","act":{"moodCode":"EVN","classCode":"ACT","templateId":[{"root":"2.16.840.1.0.1.27"},{"root":"1.3.6.1.4..1"},{"root":"1.3.6.1.4.4.5.2"}],"id":{"root":"068fd4d4-dfa2-48190768f"},"code":{"nullFlavor":"NA"},"statusCode":{"code":"completed"},"effectiveTime":{"low":{"value":"20140428144743+0100"},"high":{"value":"20140428144743+0100"},"text":""},"entryRelationship":{"typeCode":"SUBJ","observation":{"moodCode":"EVN","classCode":"OBS","templateId":[{"root":"2.16.840.1.20.1.28"},{"root":"1.3.6.1.4.1.5.3.1.4.5"}],"id":{"root":"fa34b4da4dbb-b090-01bd4d6ef62b"},"code":{"codeSystemName":"SNOMEDCT","code":"282009","displayName":"diagnosis","codeSystem":"2.16.840.1.6.96"},"text":"","statusCode":{"code":"completed"},"effectiveTime":{"low":{"value":"20140428144743+0100"},"high":{"nullFlavor":"UNK"},"text":""},"value":{"codeSystemName":"ICD-9","xsi: type":"CD","code":"003.1","displayName":"Slasia","codeSystem":"2.16.840.1.103","originalText":{"reference":{"value":"#pr101-desc"},"text":""},"text":""},"entryRelationship":{"typeCode":"REFR","observation":{"moodCode":"EVN","classCode":"OBS","templateId":[{"root":"2.16.840..1.50"},{"root":"2.16.8410.20.1.57"},{"root":"1.3.6.13.1.4.1.1"}],"code":{"codeSystemName":"LOINC","code":"33999-4","displayName":"Status","codeSystem":"2.16.840.1.113883.6.1"},"statusCode":{"code":"completed"},"value":{"codeSystemName":"CT","xsi: type":"CE","code":"55563","displayName":"active","codeSystem":"2.16.8406.96"},"text":""},"text":""}},"text":""},"text":""},"text":""}},"text":""}'
Now create a table treating the entire string as one column
'drop table json_test;
create external table json_test(value string)
LOCATION 'path'; '
you can lateral view json_tuple to get the fields you need now.
'set hive.cli.print.header=true;
SELECT c.moodCode,c.classCode,d.root,c.title,e.mediaType FROM json_test a LATERAL VIEW json_tuple(a.value, 'section') b AS section LATERAL VIEW json_tuple(b.section,'moodCode','classCode','templateId','title','text')c
AS moodCode,classCode,templateId,title,text LATERAL VIEW json_tuple(c.templateId,'root')d
AS root LATERAL VIEW json_tuple(c.text,'mediaType')e AS mediaType;'
result
'c.moodcode|c.classcode|d.root |c.title |e.mediatype
xxx | xxx |2.xx.840 |problems|text/x-hl7-text+xml'