Count Word Frequency in a Text

Count the frequency of each word in a line of a text - PySpark

To find the number of occurrences of each word in every line and combine them together:

  1. Map the elements in the RDD so that the line number and the word become the key, i.e. (0, ('This', 1)) becomes ((0, 'This'), 1)
  2. ReduceByKey the RDD from step 1 by summing the number of occurrences
  3. Remap the result from step 2 so that the line number becomes the key
  4. ReduceByKey again to combine the (word, count) tuples for each line, as in the code below
# spark is an existing SparkSession; each element is (line_number, (word, 1))
rdd = spark.sparkContext.parallelize([(0, ('a', 1)), (0, ('b', 1)), (0, ('a', 1)), (1, ('a', 1))])

# Steps 1 and 2: key by (line_number, word) and sum the counts
occurrences_per_line = rdd.map(lambda x: ((x[0], x[1][0]), x[1][1])).reduceByKey(lambda x, y: x + y)

# Steps 3 and 4: key by line_number and combine the (word, count) tuples
occurrences_per_line.map(lambda x: (x[0][0], (x[0][1], x[1]))).reduceByKey(lambda x, y: x + y).collect()

"""
[(0, ('a', 2, 'b', 1)), (1, ('a', 1))]
"""

Efficient and fast way to count word frequency and sort the list in Python

I achieved efficient and fast logic by combining collections.Counter, Cython and multiprocessing.Pool: the counting part is replaced with Counter, and Cython and multiprocessing.Pool are used for speed.

Below is the entire code:

import ast
from collections import defaultdict, Counter
from typing import DefaultDict

import pandas as pd

def count_words(top_pos: DefaultDict, top_neg: DefaultDict, top_rel: DefaultDict, data: pd.DataFrame):
    # Columns "1", "2" and "3" hold either stringified lists or real lists of words.
    if isinstance(data.loc[:, "3"].values[0], str):
        data_pos = data.loc[:, "1"].apply(lambda x: Counter(ast.literal_eval(x)))
        data_neg = data.loc[:, "2"].apply(lambda x: Counter(ast.literal_eval(x)))
        data_rel = data.loc[:, "3"].apply(lambda x: Counter(ast.literal_eval(x)))
    elif isinstance(data.loc[:, "3"].values[0], list):
        data_pos = data.loc[:, "1"].apply(lambda x: Counter(x))
        data_neg = data.loc[:, "2"].apply(lambda x: Counter(x))
        data_rel = data.loc[:, "3"].apply(lambda x: Counter(x))
    else:
        raise ValueError("The type must be either list or str")

    # Accumulate the per-row counts into the running totals.
    for item in data_pos:
        for k, v in item.items():
            top_pos[k] += v
    for item in data_neg:
        for k, v in item.items():
            top_neg[k] += v
    for item in data_rel:
        for k, v in item.items():
            top_rel[k] += v

    return top_pos, top_neg, top_rel

def test(data, top_word_id):
    for i, item in top_word_id.iterrows():
        id = item.loc["category_code"]
        target = item.loc['target']
        code = item.loc['region_code']

        category_data = data[data.loc[:, id] == 1]

        if category_data.shape[0] == 0:
            continue

        temp = category_data[(category_data.loc[:, 'target'] == target) & (category_data.loc[:, 'code'] == code)]

        if temp.shape[0] == 0:
            continue

        top_pos, top_neg, top_rel = count_words(top_word_id.loc[i, "weekly_positive_top_word"], top_word_id.loc[i, "weekly_negative_top_word"], top_word_id.loc[i, "weekly_associated_top_word"], data)
        top_word_id.at[i, "weekly_associated_top_word"] = top_rel
        top_word_id.at[i, "weekly_positive_top_word"] = top_pos
        top_word_id.at[i, "weekly_negative_top_word"] = top_neg

    return top_word_id

from multiprocessing import Pool, cpu_count
from contextlib import contextmanager
import numpy as np

@contextmanager
def poolcontext(*args, **kwargs):
    pool = Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        pool.terminate()

def parallelize_aggregation(data, top_word_id, func):
    num_cores = cpu_count()
    # Split the rows of top_word_id into one chunk per core.
    df_split = np.array_split(top_word_id, num_cores, axis=0)

    with poolcontext(processes=num_cores) as pool:
        # Each worker gets the full data plus its own chunk of top_word_id.
        results = pool.starmap(func, zip([data for _ in range(num_cores)], df_split))
    return results

parallelize_aggregation(data, top_word_id, aggregate.test)
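parallelize_aggregation returns one processed slice of top_word_id per worker. If a single DataFrame is needed afterwards, the slices can be concatenated; a short follow-up sketch (assuming each chunk keeps its original index, as np.array_split does):

import pandas as pd

# Each element of results is one worker's processed chunk of top_word_id.
results = parallelize_aggregation(data, top_word_id, aggregate.test)
top_word_id = pd.concat(results, axis=0)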

The table below compares the timings:

Code                                        Time
Cython (the code in the question)           4749 s
Cython + Counter                            3066 s
Cython + Counter + multiprocessing.Pool       10 s
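
For reference, the core pattern this answer builds on -- counting with collections.Counter and sorting by frequency -- is tiny on its own. A minimal illustration, independent of the Cython/multiprocessing setup above:

from collections import Counter

words = "the cat sat on the mat and the cat slept".split()
counts = Counter(words)          # count every word in one pass
print(counts.most_common(3))     # [('the', 3), ('cat', 2), ('sat', 1)]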

Count frequency of multi-word terms in large texts with Python

@SidharthMacherla put me on the right track (NLTK and tokenization), although his solution does not address the problem of multi-word expressions, which, moreover, might be overlapping.

In brief, the best method I've found is to subclass NLTK's MWETokenizer and add a method that counts the multi-words using nltk.util.Trie:

import re, regex, timeit
from nltk import FreqDist
from nltk.tokenize import MWETokenizer
from nltk.util import Trie

class FreqMWETokenizer(MWETokenizer):
    """A tokenizer that processes tokenized text and merges multi-word expressions
    into single tokens.
    """

    def __init__(self, mwes=None, separator="_"):
        super().__init__(mwes, separator)

    def freqs(self, text):
        """
        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A frequency dictionary with multi-words merged together as keys
        :rtype: dict
        :Example:
        >>> tokenizer = FreqMWETokenizer([mw.split() for mw in ['multilayer ceramic', 'multilayer ceramic capacitor', 'ceramic capacitor']], separator=' ')
        >>> tokenizer.freqs("Gimme that multilayer ceramic capacitor please!".split())
        {'multilayer ceramic': 1, 'multilayer ceramic capacitor': 1, 'ceramic capacitor': 1}
        """
        i = 0
        n = len(text)
        result = {}

        while i < n:
            if text[i] in self._mwes:
                # possible MWE match
                j = i
                trie = self._mwes
                while j < n and text[j] in trie:
                    if Trie.LEAF in trie:
                        # success: a multi-word ending at this position, count it
                        mw = self._separator.join(text[i:j])
                        result[mw] = result.get(mw, 0) + 1
                    trie = trie[text[j]]
                    j = j + 1
                else:
                    if Trie.LEAF in trie:
                        # success: the match ended on a complete multi-word, count it
                        mw = self._separator.join(text[i:j])
                        result[mw] = result.get(mw, 0) + 1
                    i += 1
            else:
                i += 1

        return result

>>> tokenizer = FreqMWETokenizer([ mw.split() for mw in ['multilayer ceramic', 'multilayer ceramic capacitor', 'ceramic capacitor']], separator=' ')
>>> tokenizer.freqs("Gimme that multilayer ceramic capacitor please!".split())
{'multilayer ceramic': 1, 'multilayer ceramic capacitor': 1, 'ceramic capacitor': 1}

Here's the test suite with speed measures:

Counting 10k multi-word terms in 10m characters took 2 seconds with FreqMWETokenizer, 4 seconds with the plain MWETokenizer (which also provides a complete tokenization, but counts no overlaps), 150 seconds with the simple count method, and 1000 seconds with a large regex. Going up to 100k multi-word terms in 100m characters remains doable with the tokenizers, but not with the count or regex methods.

For testing, please find the two large sample files at https://mega.nz/file/PsVVWSzA#5-OHy-L7SO6fzsByiJzeBnAbtJKRVy95YFdjeF_7yxA


def freqtokenizer(thissampledict, thissampletext):
    """
    This method uses the above FreqMWETokenizer's freqs function.
    It captures overlapping multi-words.

    counting 1000 terms in 1000000 characters took 0.3222855870008061 seconds. found 0 terms from the list.
    counting 10000 terms in 10000000 characters took 2.5309120759993675 seconds. found 21 terms from the list.
    counting 100000 terms in 29467534 characters took 10.57763242800138 seconds. found 956 terms from the list.
    counting 743274 terms in 29467534 characters took 25.613067482998304 seconds. found 10411 terms from the list.
    """
    tokenizer = FreqMWETokenizer([mw.split() for mw in thissampledict], separator=' ')
    thissampletext = re.sub(' +', ' ', re.sub(r"[^\s\w/\-']+", ' ', thissampletext))  # removing punctuation except /-'_
    freqs = tokenizer.freqs(thissampletext.split())
    return freqs

def nltkmethod(thissampledict, thissampletext):
    """ This function first produces a tokenization by means of MWETokenizer.
    This takes the biggest matching multi-word, no overlaps.
    They could be computed separately on the dictionary.

    counting 1000 terms in 1000000 characters took 0.34804968100070255 seconds. found 0 terms from the list.
    counting 10000 terms in 10000000 characters took 3.9042628339993826 seconds. found 20 terms from the list.
    counting 100000 terms in 29467534 characters took 12.782784996001283 seconds. found 942 terms from the list.
    counting 743274 terms in 29467534 characters took 28.684293715999956 seconds. found 9964 terms from the list.
    """
    termfreqdic = {}
    tokenizer = MWETokenizer([mw.split() for mw in thissampledict], separator=' ')
    thissampletext = re.sub(' +', ' ', re.sub(r"[^\s\w/\-']+", ' ', thissampletext))  # removing punctuation except /-'_
    tokens = tokenizer.tokenize(thissampletext.split())
    freqdist = FreqDist(tokens)
    termsfound = set(freqdist.keys()) & set(thissampledict)
    for t in termsfound:
        termfreqdic[t] = freqdist[t]
    return termfreqdic

def countmethod(thissampledict, thissampletext):
    """
    counting 1000 in 1000000 took 0.9351876619912218 seconds.
    counting 10000 in 10000000 took 91.92642056700424 seconds.
    counting 100000 in 29467534 took 3185.7411157219904 seconds.
    """
    termfreqdic = {}
    for term in thissampledict:
        termfreqdic[term] = thissampletext.count(term)
    return termfreqdic

def regexmethod(thissampledict, thissampletext):
    """
    counting 1000 terms in 1000000 characters took 2.298602456023218 seconds.
    counting 10000 terms in 10000000 characters took 395.46084802100086 seconds.
    counting 100000: impossible
    """
    termfreqdic = {}
    termregex = re.compile(r'\b' + r'\b|\b'.join(thissampledict))
    for m in termregex.finditer(thissampletext):
        termfreqdic[m.group(0)] = termfreqdic.get(m.group(0), 0) + 1
    return termfreqdic

def timing():
    """
    for testing, find the two large sample files at
    https://mega.nz/file/PsVVWSzA#5-OHy-L7SO6fzsByiJzeBnAbtJKRVy95YFdjeF_7yxA
    """
    sampletext = open("G06K0019000000.txt").read().lower()
    sampledict = open("manyterms.lower.txt").read().strip().split('\n')
    print(len(sampletext), 'characters', len(sampledict), 'terms')

    for i in range(4):
        for f in [freqtokenizer, nltkmethod, countmethod, regexmethod]:
            start = timeit.default_timer()
            thissampledict = sampledict[:1000 * 10**i]
            thissampletext = sampletext[:1000000 * 10**i]

            termfreqdic = f(thissampledict, thissampletext)

            print('{f} counting {terms} terms in {characters} characters took {seconds} seconds. found {termfreqdic} terms from the list.'.format(
                f=f, terms=len(thissampledict), characters=len(thissampletext),
                seconds=timeit.default_timer() - start,
                termfreqdic=len({a: v for (a, v) in termfreqdic.items() if v})))

timing()

Total Frequency Count for words using NLTK Python

It is running; you just need to print the repr of fdist to see some of its content, or use fdist.items() or dict() on it to see all of the content:

>>> print(repr(fdist)) # repr
FreqDist({'.': 4, 'he': 4, 'the': 2, 'when': 2, ',': 2, 'glass': 2, 'of': 2, 'water': 2, 'bob': 1, 'went': 1, ...})
>>> fdist.items() # items
dict_items([('bob', 1), ('went', 1), ('down', 1), ('the', 2), ('street', 1), ('to', 1), ('purchase', 1), ('groceries', 1), ('.', 4), ('when', 2), ('he', 4), ('was', 1), ('walking', 1), ('back', 1), (',', 2), ('it', 1), ('became', 1), ('very', 1), ('hot', 1), ('outside', 1), ('cameback', 1), ('drank', 1), ('a', 1), ('cold', 1), ('glass', 2), ('of', 2), ('water', 2), ('after', 1), ('drinking', 1), ('felt', 1), ('much', 1), ('more', 1), ('cooler', 1), ('in', 1), ('temperature', 1)])
>>> dict(fdist) # dict
{'bob': 1, 'went': 1, 'down': 1, 'the': 2, 'street': 1, 'to': 1, 'purchase': 1, 'groceries': 1, '.': 4, 'when': 2, 'he': 4, 'was': 1, 'walking': 1, 'back': 1, ',': 2, 'it': 1, 'became': 1, 'very': 1, 'hot': 1, 'outside': 1, 'cameback': 1, 'drank': 1, 'a': 1, 'cold': 1, 'glass': 2, 'of': 2, 'water': 2, 'after': 1, 'drinking': 1, 'felt': 1, 'much': 1, 'more': 1, 'cooler': 1, 'in': 1, 'temperature': 1}
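
Since FreqDist behaves like a collections.Counter, you can also ask directly for the most frequent words; for example, with the same fdist as above (tie order may vary):

>>> fdist.most_common(5)
[('.', 4), ('he', 4), ('the', 2), ('when', 2), (',', 2)]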


