phplarge-datalarge-filesjsonstream

Parse Large (100MB) JSON with PHP using salsify's JsonStreamerParser in PHPProBid


I have a JSON file:

curl https://api.mercadolibre.com/sites/MLM/categories/all  > categoriesMLM.gz

which contains an object of objects (about 60000+). I've also installed https://github.com/salsify/jsonstreamingparser via composer to save every item into a database in PHPProBid as written in the following code:

public function SaveCategoryByStream(){
    require_once dirname(__FILE__).'/../../../../../../vendor/autoload.php';
    ini_set('memory_limit', '4024M');
    ini_set('max_execution_time', 0);
    $testfile = '/home/richi/Desktop/categoriesMLM.json';

    $listener = new \JsonStreamingParser\Listener\GeoJsonListener(function ($category) {
        
        $category_name = Array(
            $category['name']
        );
        $category['id'] = preg_replace('/[^0-9]/', '', $category['id']);
        $id = $order_id = Array(
            $category['id']
        );
        $parent_id = $category['path_from_root'];
        if($parent_id == null){

        }else{
            $parent_id = end($category['path_from_root']);
            $parent_id = prev($category['path_from_root']);
            $parent_id = preg_replace('/[^0-9]/', '', $parent_id['id']);
        }
        $new_category = Array(
            'parent_id' => $parent_id,
            'id' => $id,
            'order_id' => $order_id,
            'name' => $category_name,
            'full_name' => $category_name
        );
        try {
            $categoriesService = new Service\Table\Relational\Categories();
            $categoriesService->save($new_category);
            $parent_id = $id;//['id'];
            //Saving all children categories of that specific category
            for($ii = 0; $ii < count($category['children_categories']); $ii++){
                $this->SaveChildrenCategory($parent_id, $category['children_categories'][$ii], $categoriesService);
            }
        }catch(Exception $e){
            echo $e;
        }
    });
    $stream = fopen($testfile, 'r');
    try {
        $parser = new \JsonStreamingParser\Parser($stream, $listener);
        $parser->parse();
        fclose($stream);
    } catch (Exception $e) {
        fclose($stream);
        throw $e;
    }

    $controller = 'Mercadolibre';
    $headline = $this->_('MeliSync');
    $filter = 'first_time';
    return array(
        'controller' => $controller,
        'headline'   => $headline,
        'messages'   => $this->_flashMessenger->getMessages(),
        'filter'     => $filter
    );

}

Once decompressed, the JSON file looks something like this:

{
  "MLM5320": {
    "id": "MLM5320",
    "name": "Accesorios para Autos",
    "picture": "http://resources.mlstatic.com/category/images/a076507d-2b2a-43a8-b08a-86a2197c3a82.png",
    "permalink": null,
    "total_items_in_this_category": 0,
    "path_from_root": [
      {
        "id": "MLM1747",
        "name": "Accesorios para Vehículos"
      },
      {
        "id": "MLM5320",
        "name": "Accesorios para Autos"
      }
    ],
    "children_categories": [
      {
        "id": "MLM92132",
        "name": "Accesorios para el Exterior",
        "total_items_in_this_category": 119421
      },
      {
        "id": "MLM92130",
        "name": "Accesorios para el Interior",
        "total_items_in_this_category": 152424
      },
      {
        "id": "MLM92140",
        "name": "Otros",
        "total_items_in_this_category": 0
      }
    ],
    "attribute_types": "attributes",
    "settings": {
      "adult_content": false,
      "buying_allowed": true,
      "buying_modes": [
        "buy_it_now",
        "auction"
      ],
      "catalog_domain": null,
      "coverage_areas": "not_allowed",
      "currencies": [
        "USD",
        "MXN"
      ],
      "fragile": false,
      "immediate_payment": "required",
      "item_conditions": [
        "not_specified",
        "new",
        "used"
      ],
      "items_reviews_allowed": false,
      "listing_allowed": false,
      "max_description_length": 50000,
      "max_pictures_per_item": 12,
      "max_pictures_per_item_var": 10,
      "max_sub_title_length": 70,
      "max_title_length": 60,
      "maximum_price": null,
      "minimum_price": null,
      "mirror_category": null,
      "mirror_master_category": null,
      "mirror_slave_categories": [
      ],
      "price": "required",
      "reservation_allowed": "not_allowed",
      "restrictions": [
      ],
      "rounded_address": false,
      "seller_contact": "not_allowed",
      "shipping_modes": [
        "me1",
        "not_specified",
        "custom"
      ],
      "shipping_options": [
        "carrier",
        "custom"
      ],
      "shipping_profile": "optional",
      "show_contact_information": false,
      "simple_shipping": "optional",
      "stock": "required",
      "sub_vertical": null,
      "subscribable": false,
      "tags": [
      ],
      "vertical": null,
      "vip_subdomain": "articulo"
    },
    "meta_categ_id": null,
    "attributable": false
  },"MLM187169": {
    "id": "MLM187169",
    "name": "Alimentos para Bebés",
    "picture": null,
    "permalink": null,
    "total_items_in_this_category": 1752,
    "path_from_root": [
      {
        "id": "MLM1403",
        "name": "Alimentos y Bebidas"
      },
      {
        "id": "MLM187169",
        "name": "Alimentos para Bebés"
      }
    ],
    "children_categories": [
      {
        "id": "MLM189061",
        "name": "Leche",
        "total_items_in_this_category": 934
      },
      {
        "id": "MLM189062",
        "name": "Papilla",
        "total_items_in_this_category": 193
      },
      {
        "id": "MLM189060",
        "name": "Otros",
        "total_items_in_this_category": 619
      }
    ],
    "attribute_types": "attributes",
    "settings": {
      "adult_content": false,
      "buying_allowed": true,
      "buying_modes": [
        "auction",
        "buy_it_now"
      ],
      "catalog_domain": null,
      "coverage_areas": "not_allowed",
      "currencies": [
        "USD",
        "MXN"
      ],
      "fragile": false,
      "immediate_payment": "required",
      "item_conditions": [
        "used",
        "not_specified",
        "new"
      ],
      "items_reviews_allowed": false,
      "listing_allowed": false,
      "max_description_length": 50000,
      "max_pictures_per_item": 12,
      "max_pictures_per_item_var": 10,
      "max_sub_title_length": 70,
      "max_title_length": 60,
      "maximum_price": null,
      "minimum_price": null,
      "mirror_category": null,
      "mirror_master_category": "MLM39965",
      "mirror_slave_categories": [
      ],
      "price": "required",
      "reservation_allowed": "not_allowed",
      "restrictions": [
      ],
      "rounded_address": false,
      "seller_contact": "not_allowed",
      "shipping_modes": [
        "custom",
        "not_specified",
        "me2",
        "me1"
      ],
      "shipping_options": [
        "carrier",
        "custom"
      ],
      "shipping_profile": "optional",
      "show_contact_information": false,
      "simple_shipping": "optional",
      "stock": "required",
      "sub_vertical": null,
      "subscribable": true,
      "tags": [
      ],
      "vertical": null,
      "vip_subdomain": "articulo"
    },
    "meta_categ_id": null,
    "attributable": false
  },
  "MLM1403": {
    "id": "MLM1403",
    "name": "Alimentos y Bebidas",
    "picture": "http://resources.mlstatic.com/category/images/57ad1885-5f74-4ac0-b54c-d0b4c575a371.png",
    "permalink": "http://home.mercadolibre.com.mx/alimentos-bebidas/",
    "total_items_in_this_category": 72668,
    "path_from_root": [
      {
        "id": "MLM1403",
        "name": "Alimentos y Bebidas"
      }
    ],
    "children_categories": [
      {
        "id": "MLM187169",
        "name": "Alimentos para Bebés",
        "total_items_in_this_category": 1584
      },
      {
        "id": "MLM189607",
        "name": "Bebidas con Alcohol",
        "total_items_in_this_category": 4593
      },
      {
        "id": "MLM178700",
        "name": "Bebidas sin Alcohol",
        "total_items_in_this_category": 12501
      },
      {
        "id": "MLM1423",
        "name": "Comestibles",
        "total_items_in_this_category": 37780
      },
      {
        "id": "MLM187587",
        "name": "Kit de Preparación de Cerveza",
        "total_items_in_this_category": 300
      },
      {
        "id": "MLM191759",
        "name": "Lúpulo",
        "total_items_in_this_category": 0
      },
      {
        "id": "MLM189225",
        "name": "Yerba Mate",
        "total_items_in_this_category": 195
      },
      {
        "id": "MLM1417",
        "name": "Otros",
        "total_items_in_this_category": 15146
      }
    ],
    "attribute_types": "attributes",
    "settings": {
      "adult_content": false,
      "buying_allowed": true,
      "buying_modes": [
        "buy_it_now",
        "auction"
      ],
      "catalog_domain": null,
      "coverage_areas": "not_allowed",
      "currencies": [
        "USD",
        "MXN"
      ],
      "fragile": false,
      "immediate_payment": "required",
      "item_conditions": [
        "used",
        "not_specified",
        "new"
      ],
      "items_reviews_allowed": false,
      "listing_allowed": false,
      "max_description_length": 50000,
      "max_pictures_per_item": 12,
      "max_pictures_per_item_var": 10,
      "max_sub_title_length": 70,
      "max_title_length": 60,
      "maximum_price": null,
      "minimum_price": null,
      "mirror_category": null,
      "mirror_master_category": null,
      "mirror_slave_categories": [
      ],
      "price": "required",
      "reservation_allowed": "not_allowed",
      "restrictions": [
      ],
      "rounded_address": false,
      "seller_contact": "not_allowed",
      "shipping_modes": [
        "me2",
        "not_specified",
        "me1",
        "custom"
      ],
      "shipping_options": [
        "carrier",
        "custom"
      ],
      "shipping_profile": "optional",
      "show_contact_information": false,
      "simple_shipping": "optional",
      "stock": "required",
      "sub_vertical": null,
      "subscribable": false,
      "tags": [
      ],
      "vertical": null,
      "vip_subdomain": "articulo"
    },
    "meta_categ_id": null,
    "attributable": false
  },
  "MLM92150": {
    "id": "MLM92150",
    "name": "Accesorios para Cuatrimotos",
    "picture": "http://resources.mlstatic.com/category/images/61189af5-4e70-45b6-89fa-95339ccc2166.png",
    "permalink": null,
    "total_items_in_this_category": 2360,
    "path_from_root": [
      {
        "id": "MLM1747",
        "name": "Accesorios para Vehículos"
      },
      {
        "id": "MLM92150",
        "name": "Accesorios para Cuatrimotos"
      }
    ],
    "children_categories": [
      {
        "id": "MLM165695",
        "name": "Caja",
        "total_items_in_this_category": 136
      },
      {
        "id": "MLM189905",
        "name": "Fundas",
        "total_items_in_this_category": 52
      },
      {
        "id": "MLM165696",
        "name": "Rampas",
        "total_items_in_this_category": 69
      },
      {
        "id": "MLM165694",
        "name": "Ropa",
        "total_items_in_this_category": 165
      },
      {
        "id": "MLM165697",
        "name": "Winch",
        "total_items_in_this_category": 451
      },
      {
        "id": "MLM165698",
        "name": "Otros",
        "total_items_in_this_category": 1486
      }
    ],
    "attribute_types": "attributes",
    "settings": {
      "adult_content": false,
      "buying_allowed": true,
      "buying_modes": [
        "buy_it_now",
        "auction"
      ],
      "catalog_domain": null,
      "coverage_areas": "not_allowed",
      "currencies": [
        "MXN",
        "USD"
      ],
      "fragile": false,
      "immediate_payment": "required",
      "item_conditions": [
        "not_specified",
        "used",
        "new"
      ],
      "items_reviews_allowed": false,
      "listing_allowed": false,
      "max_description_length": 50000,
      "max_pictures_per_item": 12,
      "max_pictures_per_item_var": 10,
      "max_sub_title_length": 70,
      "max_title_length": 60,
      "maximum_price": null,
      "minimum_price": null,
      "mirror_category": null,
      "mirror_master_category": null,
      "mirror_slave_categories": [
      ],
      "price": "required",
      "reservation_allowed": "not_allowed",
      "restrictions": [
      ],
      "rounded_address": false,
      "seller_contact": "not_allowed",
      "shipping_modes": [
        "custom",
        "me1",
        "not_specified"
      ],
      "shipping_options": [
        "carrier",
        "custom"
      ],
      "shipping_profile": "optional",
      "show_contact_information": false,
      "simple_shipping": "optional",
      "stock": "required",
      "sub_vertical": null,
      "subscribable": false,
      "tags": [
      ],
      "vertical": null,
      "vip_subdomain": "articulo"
    },
    "meta_categ_id": null,
    "attributable": false
  },
  "MLM189061": {
    "id": "MLM189061",
    "name": "Leche",
    "picture": null,
    "permalink": null,
    "total_items_in_this_category": 114,
    "path_from_root": [
      {
        "id": "MLM1403",
        "name": "Alimentos y Bebidas"
      },
      {
        "id": "MLM187169",
        "name": "Alimentos para Bebés"
      },
      {
        "id": "MLM189061",
        "name": "Leche"
      }
    ],
    "children_categories": [
    ],
    "attribute_types": "attributes",
    "settings": {
      "adult_content": false,
      "buying_allowed": true,
      "buying_modes": [
        "auction",
        "buy_it_now"
      ],
      "catalog_domain": "MLM-BABIES_FORMULA",
      "coverage_areas": "not_allowed",
      "currencies": [
        "USD",
        "MXN"
      ],
      "fragile": false,
      "immediate_payment": "required",
      "item_conditions": [
        "used",
        "not_specified",
        "new"
      ],
      "items_reviews_allowed": false,
      "listing_allowed": true,
      "max_description_length": 50000,
      "max_pictures_per_item": 12,
      "max_pictures_per_item_var": 10,
      "max_sub_title_length": 70,
      "max_title_length": 60,
      "maximum_price": null,
      "minimum_price": null,
      "mirror_category": "MLM189058",
      "mirror_master_category": "MLM189058",
      "mirror_slave_categories": [
      ],
      "price": "required",
      "reservation_allowed": "not_allowed",
      "restrictions": [
      ],
      "rounded_address": false,
      "seller_contact": "not_allowed",
      "shipping_modes": [
        "custom",
        "not_specified",
        "me2",
        "me1"
      ],
      "shipping_options": [
        "carrier",
        "custom"
      ],
      "shipping_profile": "optional",
      "show_contact_information": false,
      "simple_shipping": "optional",
      "stock": "required",
      "sub_vertical": null,
      "subscribable": true,
      "tags": [
      ],
      "vertical": null,
      "vip_subdomain": "articulo"
    },
    "meta_categ_id": null,
    "attributable": false
  },
  "MLM1071": {
    "id": "MLM1071",
    "name": "Animales y Mascotas",
    "picture": "http://resources.mlstatic.com/category/images/0b18438b-f56c-421a-a020-f581f40f4c24.png",
    "permalink": "http://home.mercadolibre.com.mx/animales/",
    "total_items_in_this_category": 0,
    "path_from_root": [
      {
        "id": "MLM1071",
        "name": "Animales y Mascotas"
      }
    ],
    "children_categories": [
      {
        "id": "MLM1100",
        "name": "Aves",
        "total_items_in_this_category": 0
      },
      {
        "id": "MLM189310",
        "name": "Conejos",
        "total_items_in_this_category": 4006
      },
      {
        "id": "MLM1117",
        "name": "Equinos",
        "total_items_in_this_category": 4128
      },
      {
        "id": "MLM1081",
        "name": "Gatos",
        "total_items_in_this_category": 35580
      },
      {
        "id": "MLM3615",
        "name": "Libros de animales",
        "total_items_in_this_category": 10917
      },
      {
        "id": "MLM1091",
        "name": "Peces",
        "total_items_in_this_category": 2218
      },
      {
        "id": "MLM1072",
        "name": "Perros",
        "total_items_in_this_category": 0
      },
      {
        "id": "MLM1111",
        "name": "Reptiles y Anfibios",
        "total_items_in_this_category": 420
      },
      {
        "id": "MLM1105",
        "name": "Roedores",
        "total_items_in_this_category": 8093
      },
      {
        "id": "MLM1126",
        "name": "Otros",
        "total_items_in_this_category": 2736
      }
    ],
    "attribute_types": "attributes",
    "settings": {
      "adult_content": false,
      "buying_allowed": true,
      "buying_modes": [
        "buy_it_now",
        "auction"
      ],
      "catalog_domain": null,
      "coverage_areas": "not_allowed",
      "currencies": [
        "USD",
        "MXN"
      ],
      "fragile": false,
      "immediate_payment": "required",
      "item_conditions": [
        "new",
        "used",
        "not_specified"
      ],
      "items_reviews_allowed": false,
      "listing_allowed": false,
      "max_description_length": 50000,
      "max_pictures_per_item": 12,
      "max_pictures_per_item_var": 10,
      "max_sub_title_length": 70,
      "max_title_length": 60,
      "maximum_price": null,
      "minimum_price": null,
      "mirror_category": null,
      "mirror_master_category": null,
      "mirror_slave_categories": [
      ],
      "price": "required",
      "reservation_allowed": "not_allowed",
      "restrictions": [
      ],
      "rounded_address": false,
      "seller_contact": "not_allowed",
      "shipping_modes": [
        "custom",
        "not_specified",
        "me1"
      ],
      "shipping_options": [
        "custom",
        "carrier"
      ],
      "shipping_profile": "optional",
      "show_contact_information": false,
      "simple_shipping": "optional",
      "stock": "required",
      "sub_vertical": null,
      "subscribable": false,
      "tags": [
      ],
      "vertical": null,
      "vip_subdomain": "articulo"
    },
    "meta_categ_id": null,
    "attributable": false
  }
}

or in a human-readable way:

{
   "MLMXXXXX":{...},
   "MLMXXXY":{...},
    ...
}

Nonetheless, when I call that function it gets stuck after about saving 3552. I also read that GeoJsonListener loads JSON in memory. My question is how can I create a Listener that loads each object individually instead of loading the whole JSON in memory.

Here's the output of 3552th Item:

{ id: 'MLM45922',
  name: 'Mitsubishi',
  picture: null,
  permalink: null,
  total_items_in_this_category: 39,
  path_from_root: 
   [ { id: 'MLM1747', name: 'Accesorios para Vehículos' },
     { id: 'MLM179617', name: 'Tuning y Performance' },
     { id: 'MLM179724', name: 'Performance' },
     { id: 'MLM4859', name: 'Filtros Alto Flujo' },
     { id: 'MLM45922', name: 'Mitsubishi' } ],
  children_categories: [],
  attribute_types: 'attributes',
  settings: 
   { adult_content: false,
     buying_allowed: true,
     buying_modes: [ 'buy_it_now', 'auction' ],
     catalog_domain: null,
     coverage_areas: 'not_allowed',
     currencies: [ 'USD', 'MXN' ],
     fragile: false,
     immediate_payment: 'required',
     item_conditions: [ 'used', 'not_specified', 'new' ],
     items_reviews_allowed: false,
     listing_allowed: true,
     max_description_length: 50000,
     max_pictures_per_item: 12,
     max_pictures_per_item_var: 10,
     max_sub_title_length: 70,
     max_title_length: 60,
     maximum_price: null,
     minimum_price: null,
     mirror_category: null,
     mirror_master_category: null,
     mirror_slave_categories: [],
     price: 'required',
     reservation_allowed: 'not_allowed',
     restrictions: [],
     rounded_address: false,
     seller_contact: 'not_allowed',
     shipping_modes: [ 'not_specified', 'custom', 'me1', 'me2' ],
     shipping_options: [ 'custom', 'carrier' ],
     shipping_profile: 'optional',
     show_contact_information: false,
     simple_shipping: 'optional',
     stock: 'required',
     sub_vertical: null,
     subscribable: false,
     tags: [],
     vertical: null,
     vip_subdomain: 'articulo' },
  meta_categ_id: null,
  attributable: false }

Solution

  • The GeoJSONListener does exactly what you want to do - it keeps the second level of the objects in memory. That way it loads each MLM object in your file by itself - it does not load the complete file into memory.

    Testing out the code you have included on the file you've referenced (and reducing memory limit to 32M, since a streaming parser shouldn't need 4G of memory), it parses through the whole file, reading through 27200 objects before I cancelled the process, in about 10 minutes on an old Macbook.

    This leads me to believe that the problem isn't related to your JSON parser or how you're parsing the file, and can possibly be caused by something else (like your host / web server not honouring calls to set_time_limit or your database layer locking or barfing on some content.