
Data gets mixed up while trying to transfer it to ArangoDB


I'm trying to transfer roughly 10 GB of JSON data (tweets, in my case) to a collection in ArangoDB, using joblib to parallelize the inserts:

from ArangoConn import ArangoConn
import Userdata as U

from joblib import Parallel,delayed
import json
from glob import glob
import time


def progress(total, prog, start, stri=""):
    # Simple console progress bar with a rough time-remaining estimate.
    if prog == 0:
        print("")
        prog = 1
    perc = prog / total
    diff = time.time() - start
    rem = (diff / prog) * (total - prog)
    bar = ""
    for i in range(0,int(perc*20)):
        bar = bar + "|"
    for i in range(int(perc*20),20):
        bar = bar + " "
    print("\r"+"progress: " + "[" + bar + "] " + str(prog) + " of " + 
    str(total) + ": {0:.1f}% ".format(perc * 100) + "- " + 
    time.strftime("%H:%M:%S", time.gmtime(rem)) + " " + stri, end="")

def processfile(filepath):
    # Parse one JSON file and insert its tweets using 12 worker threads.
    with open(filepath, encoding='utf-8') as file:
        data = json.load(file)
    Parallel(n_jobs=12, verbose=0, backend="threading")(
        map(delayed(ArangoConn.createDocFromObject), data))

files = glob(U.path+'/*.json')
i = 1
j = len(files)
starttime = time.time()
for f in files:
    progress(j,i,starttime,f)
    i = i+1
    processfile(f)

and

from pyArango.connection import Connection
import Userdata as U
import time


class ArangoConn:
    def __init__(self,server,user,pw,db,collectionname):
        self.server = server
        self.user = user
        self.pw = pw
        self.db = db
        self.collectionname = collectionname
        self.connection = None
        self.dbHandle = self.connect()
        if not self.dbHandle.hasCollection(name=self.collectionname):
            coll = self.dbHandle.createCollection(name=collectionname)
        else:
            coll = self.dbHandle.collections[collectionname]
        self.collection = coll

    def db_createDocFromObject(self, obj):
        data = obj.__dict__  # __dict__ is an attribute, not a callable
        doc = self.collection.createDocument()
        for key, value in data.items():
            doc[key] = value

        doc._key = str(int(round(time.time() * 1000)))
        doc.save()

    def connect(self):
        self.connection = Connection(arangoURL=self.server + ":8529",
                                     username=self.user, password=self.pw)

        if not self.connection.hasDatabase(self.db):
            db = self.connection.createDatabase(name=self.db)
        else:
            db = self.connection.databases.get(self.db)
        return db

    def disconnect(self):
        self.connection.disconnectSession()


    def getAllData(self):

        docs = []
        for doc in self.collection.fetchAll():
            docs.append(self.doc_to_result(doc))
        return docs


    def addData(self, obj):
        self.db_createDocFromObject(obj)

    def search(self,collection,search,prop):
        docs = []
        aql = """FOR q IN """+collection+""" FILTER q."""+prop+""" LIKE 
            "%"""+search+"""%" RETURN q"""
        results = self.dbHandle.AQLQuery(aql, rawResults=False, batchSize=1)
        for doc in results:
            docs.append(self.doc_to_result(doc))
        return docs


    def doc_to_result(self,arangodoc):
        modstore = arangodoc.getStore()
        modstore["_key"] = arangodoc._key
        return modstore

    def db_createDocFromJson(self, data):

        for d in data:
            doc = self.collection.createDocument()
            for key,value in d.items():
                doc[key] = value
            doc._key = str(int(round(time.time() * 1000)))
            doc.save()



    @staticmethod
    def createDocFromObject(obj):
        # Creates its own connection per call (this is what the worker threads run).
        c = ArangoConn(U.url, U.user, U.pw, U.db, U.collection)
        data = obj
        doc = c.collection.createDocument()
        for key, value in data.items():
            doc[key] = value
        doc._key = doc["id"]
        doc.save()
        c.connection.disconnectSession()

It kind of works. My problem is that the data that ends up in the database is somehow mixed up.

[Screenshot]

As you can see in the screenshot, "id" and "id_str" are not the same, even though they should be.

What I have investigated so far: everything looks fine on my side, but as soon as I communicate with the db, the data gets mixed up.

My professor thought that something in pyArango might not be thread-safe and might mess up the data, but I don't think so, as threading does not seem to be the problem.

I have no ideas left as to where this behavior could come from... Any ideas?


Solution

  • The screenshot shows the following values:

    id     : 892886691937214500
    id_str : 892886691937214465
    

    It looks like somewhere along the way the value is converted to an IEEE754 double, which cannot represent the latter value exactly: a double can only represent integers up to 2^53 - 1 (9007199254740991) without loss, and both values are well above that. So there is some precision loss due to the conversion.

    A quick example in node.js (JavaScript uses IEEE754 doubles for all number values) shows that this is likely the cause of the problem:

    $ node
    > 892886691937214500
    892886691937214500
    > 892886691937214465
    892886691937214500
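
    The same rounding can be reproduced in plain Python by round-tripping the value through a float (Python's own integers are arbitrary-precision, so the loss only shows up once a double gets involved):

    >>> 892886691937214465 == int(float(892886691937214465))
    False
    >>> int(float(892886691937214465))
    892886691937214464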
    

    So the question is where the conversion happens. Can you check whether the Python client program is sending the expected values to ArangoDB, or whether it already sends the converted/truncated values?
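
    One way to narrow this down is a quick check right after parsing, before anything is sent to the database (just a sketch; "tweets.json" stands in for one of the actual files, and the field names are taken from the screenshot):

    import json

    with open("tweets.json", encoding="utf-8") as f:
        tweets = json.load(f)  # Python's json parses integers exactly

    for tweet in tweets:
        # True here means the values survive parsing intact, so the precision
        # loss must happen somewhere between the client and the database.
        print(tweet["id"] == int(tweet["id_str"]))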

    In general, any integer number that exceeds 0x7fffffffffffffff will be truncated when stored in ArangoDB, or converted to an IEEE754 double. This can be avoided by storing the number values inside a string, but of course comparing two number strings will produce different results than comparing two numbers (e.g. "10" < "9" vs. 10 > 9).
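
    If storing strings is acceptable here, the tweets already carry the exact value in their id_str field, so the client could simply reuse it. A sketch (create_doc_keeping_exact_id is a hypothetical helper; collection is a pyArango collection like the one in the question's ArangoConn):

    def create_doc_keeping_exact_id(collection, data):
        # data is one parsed tweet dict
        doc = collection.createDocument()
        for key, value in data.items():
            doc[key] = value
        doc["id"] = data["id_str"]  # store the exact value as a string
        doc._key = data["id_str"]   # ArangoDB document keys are strings
        doc.save()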