I tried this script with my file, which contains approximately 16 columns and 5243 lines.
The first column is the key (just the integers 1 to 5243) and the second column holds the values, which are sentences (a sentence can be very long, up to a full paragraph).
When I tried it with a small file it worked, but with the real file it does not work.
# -*- coding: UTF-8 -*-
import codecs
import re
import os
import sys, argparse
import subprocess
import pprint
import csv
from itertools import islice
import pickle
# Import TreeTagger defensively and report whether the wrapper is available.
# Only one of the two status messages may be printed per run.
try:
    import treetaggerwrapper
    from treetaggerwrapper import TreeTagger, make_tags
    print("import TreeTagger OK")
except ImportError:
    print("Import TreeTagger pas Ok")
from itertools import islice
from collections import defaultdict
# Load the sentiment lexicon from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle a file
# you produced yourself.
# The context manager closes the file handle as soon as the lexicon is loaded.
with open("dict_pickle", "rb") as pickle_in:
    dico_lexique = pickle.load(pickle_in)
# Extract the verbatim column: map each key (column 1) to its sentence (column 2).
d = {}
# newline='' lets the csv module handle line breaks inside quoted fields itself,
# so a long multi-line sentence stays in one row instead of being cut into
# fragments that contain no ';' at all.
with open(sys.argv[1], 'r', encoding='cp1252', newline='') as csv_file:
    reader = csv.reader(csv_file, delimiter=';')
    for token in reader:
        if len(token) < 2:
            # Blank or malformed row: token[1] would raise IndexError here,
            # so report where it is and carry on with the next row.
            print(
                "line {}: skipped, fewer than 2 columns: {!r}".format(
                    reader.line_num, token
                ),
                file=sys.stderr,
            )
            continue
        d[token[0]] = token[1]
# Run the French TreeTagger over every sentence, keeping the same keys.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
d_tag = {key: tagger.tag_text(val) for key, val in d.items()}
# Per-key list accumulator; missing keys start out as an empty list.
d_lemma = defaultdict(list)
for k, v in d_tag.items():
    # v is the value tagger.tag_text() stored for this key; each elem is one item of it.
    for elem in v:
        # Split the tagger output line on tabs -- presumably word, POS tag and
        # lemma, as in the sample output; TODO confirm.
        # NOTE(review): an output line without tabs yields a single-element
        # list, so any later parts[1]/parts[2] access would raise IndexError.
        # The rest of this loop body is not shown in the post.
        parts = elem.split('\t')
import TreeTagger OK
Traceback (most recent call last):
File "CSV_dico.py", line 50, in <module>
IndexError: list index out of range
abordables ADJ abordable
sur PRP sur
le DET:ART le
marché NOM marché
. SENT .
Moins ADV moins
cher ADV cher
... PUN ...
25 NUM @card@
% SYM %
de PRP de
moins ADV moins
... PUN ...
est VER:pres être
quand ADV quand
-même ADJ même
moins ADV moins
qualitatif ADJ qualitatif
qu' KON que
un DET:ART un
seau NOM seau
! SENT !
Not a solution, but a tip for finding the error:
Try changing this part:
# Extract the verbatim column (original version, as posted in the question).
d = {}
with open(sys.argv[1], 'r', encoding='cp1252',) as csv_file:
    for line in csv_file:
        # Split the raw line on ';' and keep column 1 as key, column 2 as value.
        token = line.split(';')
        d[token[0]] = token[1]
To this:
# Extract the verbatim column, reporting every line that cannot be split
# into at least two ';'-separated fields instead of crashing on it.
d = {}
with open(sys.argv[1], 'r', encoding='cp1252') as csv_file:
    for line_number, line in enumerate(csv_file, start=1):
        token = line.split(';')
        try:
            d[token[0]] = token[1]
        except IndexError:
            # No second field on this line: show its number and raw content
            # so the offending input can be inspected.
            print("Bad line {}: {!r}".format(line_number, line))
This should show you the offending line, which you can then check for errors.