python nltk pickle dependency-graph

Save a dependency graph in Python


I am using the Stanford dependency parser in Python 3 to parse a sentence, which returns a dependency graph.

import pickle
from nltk.parse.stanford import StanfordDependencyParser

parser = StanfordDependencyParser('stanford-parser-full-2015-12-09/stanford-parser.jar', 'stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar')
sentences = ["I am going there","I am asking a question"]
with open("save.p","wb") as f:
      pickle.dump(parser.raw_parse_sents(sentences),f)

It gives an error:

AttributeError: Can't pickle local object 'DependencyGraph.__init__.<locals>.<lambda>'

I wonder if I could save a dependency graph either with or without pickle.


Solution

  • Follow the instructions below to get a parsed output.

    1. Output DependencyGraph to CONLL format and write to file

    (See What is CoNLL data format? and What does the dependency-parse output of TurboParser mean?)

    $ export STANFORDTOOLSDIR=$HOME
    $ export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
    $ python
    >>> from nltk.parse.stanford import StanfordDependencyParser
    >>> dep_parser=StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    >>> sent = "The quick brown fox jumps over the lazy dog."
    >>> output = next(dep_parser.raw_parse("The quick brown fox jumps over the lazy dog."))
    >>> type(output)
    <class 'nltk.parse.dependencygraph.DependencyGraph'>
    >>> output.to_conll(style=4) # The *style* parameter just means that we want 4 columns in the CONLL format
    u'The\tDT\t4\tdet\nquick\tJJ\t4\tamod\nbrown\tJJ\t4\tamod\nfox\tNN\t5\tnsubj\njumps\tVBZ\t0\troot\nover\tIN\t9\tcase\nthe\tDT\t9\tdet\nlazy\tJJ\t9\tamod\ndog\tNN\t5\tnmod\n'
    >>> with open('sent.conll', 'w') as fout:
    ...     fout.write(output.to_conll(4))
    ... 
    >>> exit()
    $ cat sent.conll
    The DT  4   det
    quick   JJ  4   amod
    brown   JJ  4   amod
    fox NN  5   nsubj
    jumps   VBZ 0   root
    over    IN  9   case
    the DT  9   det
    lazy    JJ  9   amod
    dog NN  5   nmod
    

    2. Read the CONLL file into a DependencyGraph in NLTK

    >>> from nltk.parse.dependencygraph import DependencyGraph
    >>> output = DependencyGraph.load('sent.conll') # Loads any CONLL file, that might contain 1 or more sentences.
    >>> output # list of DependencyGraphs
    [<DependencyGraph with 10 nodes>]
    >>> output[0] # the first DependencyGraph, the one we have saved
    <DependencyGraph with 10 nodes>
    >>> print output[0]
    defaultdict(<function <lambda> at 0x10e83c758>, {0: {u'ctag': u'TOP', u'head': None, u'word': None, u'deps': defaultdict(<type 'list'>, {u'ROOT': [], u'root': [5]}), u'lemma': None, u'tag': u'TOP', u'rel': None, u'address': 0, u'feats': None}, 1: {u'ctag': u'DT', u'head': 4, u'deps': defaultdict(<type 'list'>, {}), u'tag': u'DT', u'address': 1, u'word': u'The', u'lemma': u'The', u'rel': u'det', u'feats': u''}, 2: {u'ctag': u'JJ', u'head': 4, u'deps': defaultdict(<type 'list'>, {}), u'tag': u'JJ', u'address': 2, u'word': u'quick', u'lemma': u'quick', u'rel': u'amod', u'feats': u''}, 3: {u'ctag': u'JJ', u'head': 4, u'deps': defaultdict(<type 'list'>, {}), u'tag': u'JJ', u'address': 3, u'word': u'brown', u'lemma': u'brown', u'rel': u'amod', u'feats': u''}, 4: {u'ctag': u'NN', u'head': 5, u'deps': defaultdict(<type 'list'>, {u'det': [1], u'amod': [2, 3]}), u'tag': u'NN', u'address': 4, u'word': u'fox', u'lemma': u'fox', u'rel': u'nsubj', u'feats': u''}, 5: {u'ctag': u'VBZ', u'head': 0, u'deps': defaultdict(<type 'list'>, {u'nmod': [9], u'nsubj': [4]}), u'tag': u'VBZ', u'address': 5, u'word': u'jumps', u'lemma': u'jumps', u'rel': u'root', u'feats': u''}, 6: {u'ctag': u'IN', u'head': 9, u'deps': defaultdict(<type 'list'>, {}), u'tag': u'IN', u'address': 6, u'word': u'over', u'lemma': u'over', u'rel': u'case', u'feats': u''}, 7: {u'ctag': u'DT', u'head': 9, u'deps': defaultdict(<type 'list'>, {}), u'tag': u'DT', u'address': 7, u'word': u'the', u'lemma': u'the', u'rel': u'det', u'feats': u''}, 8: {u'ctag': u'JJ', u'head': 9, u'deps': defaultdict(<type 'list'>, {}), u'tag': u'JJ', u'address': 8, u'word': u'lazy', u'lemma': u'lazy', u'rel': u'amod', u'feats': u''}, 9: {u'ctag': u'NN', u'head': 5, u'deps': defaultdict(<type 'list'>, {u'case': [6], u'det': [7], u'amod': [8]}), u'tag': u'NN', u'address': 9, u'word': u'dog', u'lemma': u'dog', u'rel': u'nmod', u'feats': u''}})
    

    Note that the output of the StanfordParser is an nltk.tree.Tree, not a DependencyGraph. (This is just in case someone posts a similar question about the Tree.)

    $ export STANFORDTOOLSDIR=$HOME
    $ export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
    $ python
    >>> from nltk.parse.stanford import StanfordParser
    >>> parser=StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
    >>> output = list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))
    >>> type(output[0])
    <class 'nltk.tree.Tree'>
    

    For nltk.tree.Tree you can output it as a bracketed parse string and read the string into a Tree object:

    >>> from nltk import Tree
    >>> output[0]
    Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])
    >>> str(output[0])
    '(ROOT\n  (NP\n    (NP (DT the) (JJ quick) (JJ brown) (NN fox))\n    (NP\n      (NP (NNS jumps))\n      (PP (IN over) (NP (DT the) (JJ lazy) (NN dog))))))'
    >>> parsed_sent = str(output[0])
    >>> type(parsed_sent)
    <type 'str'>
    >>> Tree.fromstring(parsed_sent)
    Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])
    >>> parsed_tree = Tree.fromstring(parsed_sent)
    >>> type(parsed_tree)
    <class 'nltk.tree.Tree'>