I am using Python mrjob to find the 10 longest words from a text file. I have obtained a result, but the result contains duplicate words. How do I obtain only unique words (ie. remove duplicate words)?
%%file most_chars.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
WORD_RE = re.compile(r"[\w']+") # any whitespace or apostrophe, used to split lines below
class MostChars(MRJob):
def steps(self):
return [
MRStep(mapper=self.mapper_get_words,
reducer=self.reducer_find_longest_words)
]
def mapper_get_words(self, _, line):
for word in WORD_RE.findall(line):
yield None, (len(word), word.lower().strip())
# discard the key; it is just None
def reducer_find_longest_words(self, _, word_count_pairs):
# each item of word_count_pairs is (count, word),
# so yielding one results in key=counts, value=word
sorted_pair = sorted(word_count_pairs, reverse=True)
for pair in sorted_pair[0:10]:
yield pair
if __name__ == '__main__':
MostChars.run()
Actual Output:
18 "overcapitalization"
18 "overcapitalization"
18 "overcapitalization"
17 "uncomprehendingly"
17 "misunderstandings"
17 "disinterestedness"
17 "disinterestedness"
17 "disinterestedness"
17 "disinterestedness"
17 "conventionalities"
Expected Output:
18 "overcapitalization"
17 "uncomprehendingly"
17 "misunderstandings"
17 "disinterestedness"
17 "conventionalities"
and 5 more unique words
Update reducer_find_longest_words
to get only the unique elements. Note the use of list(set())
.
def reducer_find_longest_words(self, _, word_count_pairs):
# each item of word_count_pairs is (count, word),
# so yielding one results in key=counts, value=word
unique_pairs = [list(x) for x in set(tuple(x) for x in word_count_pairs)]
sorted_pair = sorted(unique_pairs, reverse=True)
for pair in sorted_pair[0:10]:
yield pair