arraysjsonparsinghiveccd

Hive on JSON File


I need help in parsing the JSON file using HIVE. This file has nested arrays, when i tried to parse the file or query using the HiVE UDF i can drill down just to one level. Then next level arrays were coming up Null in my result. I have given the example below. File has couple of sections (array ), but below given one is most complex one. I tried to use get_json_object to parse, i was able to get data just one level, it didn't pull the nested arrays at all. It would be helpful if someone guide me in parsing the nested json arrays

"section": {
                        "moodCode": "xxx",
                        "classCode": "xxx",
                        "templateId": {
                            "root": "2.xx.840"
                        },
                        "code": {
                            "codeSystemName": "LOINC",

                        },
                        "title": "problems",
                        "text": {
                            "mediaType": "text/x-hl7-text+xml",
                            "list": [{
                                "caption": "Recorded",
                                "item": {
                                    "ID": "pr101",
                                    "content": [{
                                        "ID": "pr101-desc",
                                        "text": "Salm"
                                    },
                                    "003.1"],
                                    "text": "                                              "
                                },
                                "text": "                                        "
                            },
                            {
                                "caption": "Reported",
                                "item": "None Reported",
                                "text": "                                        "
                            }],
                            "text": "                                  "
                        },
                        "entry": {
                            "typeCode": "DRIV",
                            "act": {
                                "moodCode": "EVN",
                                "classCode": "ACT",
                                "templateId": [{
                                    "root": "2.16.840.1.0.1.27"
                                },
                                {
                                    "root": "1.3.6.1.4..1"
                                },
                                {
                                    "root": "1.3.6.1.4.4.5.2"
                                }],
                                "id": {
                                    "root": "068fd4d4-dfa2-48190768f"
                                },
                                "code": {
                                    "nullFlavor": "NA"
                                },
                                "statusCode": {
                                    "code": "completed"
                                },
                                "effectiveTime": {
                                    "low": {
                                        "value": "20140428144743+0100"
                                    },
                                    "high": {
                                        "value": "20140428144743+0100"
                                    },
                                    "text": "                                              "
                                },
                                "entryRelationship": {
                                    "typeCode": "SUBJ",
                                    "observation": {
                                        "moodCode": "EVN",
                                        "classCode": "OBS",
                                        "templateId": [{
                                            "root": "2.16.840.1.20.1.28"
                                        },
                                        {
                                            "root": "1.3.6.1.4.1.5.3.1.4.5"
                                        }],
                                        "id": {
                                            "root": "fa34b4da4dbb-b090-01bd4d6ef62b"
                                        },
                                        "code": {
                                            "codeSystemName": "SNOMED CT",
                                            "code": "282009",
                                            "displayName": "diagnosis",
                                            "codeSystem": "2.16.840.1.6.96"
                                        },
                                        "text": {
                                            "reference": {
                                                "value": "#pr101"
                                            },
                                            "text": "                                      "
                                        },
                                        "statusCode": {
                                            "code": "completed"
                                        },
                                        "effectiveTime": {
                                            "low": {
                                                "value": "20140428144743+0100"
                                            },
                                            "high": {
                                                "nullFlavor": "UNK"
                                            },
                                            "text": "                                                          "
                                        },
                                        "value": {
                                            "codeSystemName": "ICD-9",
                                            "xsi:type": "CD",
                                            "code": "003.1",
                                            "displayName": "Sla sia",
                                            "codeSystem": "2.16.840.1.103",
                                            "originalText": {
                                                "reference": {
                                                    "value": "#pr101-desc"
                                                },
                                                "text": "                                          "
                                            },
                                            "text": "                                      "
                                        },
                                        "entryRelationship": {
                                            "typeCode": "REFR",
                                            "observation": {
                                                "moodCode": "EVN",
                                                "classCode": "OBS",
                                                "templateId": [{
                                                    "root": "2.16.840..1.50"
                                                },
                                                {
                                                    "root": "2.16.8410.20.1.57"
                                                },
                                                {
                                                    "root": "1.3.6.13.1.4.1.1"
                                                }],
                                                "code": {
                                                    "codeSystemName": "LOINC",
                                                    "code": "33999-4",
                                                    "displayName": "Status",
                                                    "codeSystem": "2.16.840.1.113883.6.1"
                                                },
                                                "statusCode": {
                                                    "code": "completed"
                                                },
                                                "value": {
                                                    "codeSystemName": " CT",
                                                    "xsi:type": "CE",
                                                    "code": "55563",
                                                    "displayName": "active",
                                                    "codeSystem": "2.16.8406.96"
                                                },
                                                "text": "                                                                                                                                                        "
                                            },
                                            "text": "                                      "
                                        },
                                        "text": "                                                                                                                                                                                  "
                                    },
                                    "text": "                              "
                                },
                                "text": "                                                                                                                            "
                            },
                            "text": "                      "
                        },
                        "text": "                                                          "
                    },
                    "text": "              "
                },

When i used get_json_object below mentioned data came up null.

"title": "problems",
                        "text": {
                            "mediaType": "text/x-hl7-text+xml",
                            "list": [{
                                "caption": "Recorded",
                                "item": {
                                    "ID": "pr101",
                                    "content": [{
                                        "ID": "pr101-desc",
                                        "text": "Salm"
                                    },
                                    "003.1"],

Solution

  • updated-you have to format your JSON file in such a way that each record should be in just one line, like

    '{"section":{"moodCode":"xxx","classCode":"xxx","templateId":{"root":"2.xx.840"},"code":{"codeSystemName":"LOINC"},"title":"problems","text":{"mediaType":"text/x-hl7-text+xml","list":[{"caption":"Recorded","item":{"ID":"pr101","content":[{"ID":"pr101-desc","text":"Salm"},"003.1"],"text":"                                              "},"text":"                                        "},{"caption":"Reported","item":"None Reported","text":"                                        "}],"text":"                                  "},"entry":{"typeCode":"DRIV","act":{"moodCode":"EVN","classCode":"ACT","templateId":[{"root":"2.16.840.1.0.1.27"},{"root":"1.3.6.1.4..1"},{"root":"1.3.6.1.4.4.5.2"}],"id":{"root":"068fd4d4-dfa2-48190768f"},"code":{"nullFlavor":"NA"},"statusCode":{"code":"completed"},"effectiveTime":{"low":{"value":"20140428144743+0100"},"high":{"value":"20140428144743+0100"},"text":""},"entryRelationship":{"typeCode":"SUBJ","observation":{"moodCode":"EVN","classCode":"OBS","templateId":[{"root":"2.16.840.1.20.1.28"},{"root":"1.3.6.1.4.1.5.3.1.4.5"}],"id":{"root":"fa34b4da4dbb-b090-01bd4d6ef62b"},"code":{"codeSystemName":"SNOMEDCT","code":"282009","displayName":"diagnosis","codeSystem":"2.16.840.1.6.96"},"text":"","statusCode":{"code":"completed"},"effectiveTime":{"low":{"value":"20140428144743+0100"},"high":{"nullFlavor":"UNK"},"text":""},"value":{"codeSystemName":"ICD-9","xsi: type":"CD","code":"003.1","displayName":"Slasia","codeSystem":"2.16.840.1.103","originalText":{"reference":{"value":"#pr101-desc"},"text":""},"text":""},"entryRelationship":{"typeCode":"REFR","observation":{"moodCode":"EVN","classCode":"OBS","templateId":[{"root":"2.16.840..1.50"},{"root":"2.16.8410.20.1.57"},{"root":"1.3.6.13.1.4.1.1"}],"code":{"codeSystemName":"LOINC","code":"33999-4","displayName":"Status","codeSystem":"2.16.840.1.113883.6.1"},"statusCode":{"code":"completed"},"value":{"codeSystemName":"CT","xsi: type":"CE","code":"55563","displayName":"active","codeSystem":"2.16.8406.96"},"text":""},"text":""}},"text":""},"text":""},"text":""}},"text":""}'
    

    Now create a table treating the entire string as one column

    'drop table json_test;
    create external table json_test(value string)
    LOCATION 'path'; '
    

    you can lateral view json_tuple to get the fields you need now.

    'set hive.cli.print.header=true;
     SELECT c.moodCode,c.classCode,d.root,c.title,e.mediaType FROM json_test a  LATERAL VIEW json_tuple(a.value, 'section') b AS section LATERAL VIEW   json_tuple(b.section,'moodCode','classCode','templateId','title','text')c
     AS moodCode,classCode,templateId,title,text LATERAL VIEW json_tuple(c.templateId,'root')d
     AS root LATERAL VIEW json_tuple(c.text,'mediaType')e AS mediaType;'
    

    result

    'c.moodcode|c.classcode|d.root   |c.title |e.mediatype
       xxx     | xxx       |2.xx.840 |problems|text/x-hl7-text+xml'