forked from TheAlgorithms/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_frequency_functions.py
137 lines (120 loc) · 4.98 KB
/
word_frequency_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import string
from math import log10
"""
tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
tf-idf and other word frequency algorithms are often used
as a weighting factor in information retrieval and text
mining. 83% of text-based recommender systems use
tf-idf for term weighting. In layman's terms, tf-idf
is a statistic intended to reflect how important a word
is to a document in a corpus (a collection of documents).
Here I've implemented several word frequency algorithms
that are commonly used in information retrieval: Term Frequency,
Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency)
are included.
Term Frequency is a statistical function that
returns a number representing how frequently
an expression occurs in a document. This
indicates how significant a particular term is in
a given document.
Document Frequency is a statistical function that returns
an integer representing the number of documents in a
corpus that a term occurs in (where the max number returned
would be the number of documents in the corpus).
Inverse Document Frequency is mathematically written as
log10(N/df), where N is the number of documents in your
corpus and df is the Document Frequency. If df is 0, a
ZeroDivisionError will be raised.
Term-Frequency*Inverse-Document-Frequency is a measure
of the originality of a term. It is mathematically written
as tf*log10(N/df). It compares the number of times
a term appears in a document with the number of documents
the term appears in. If df is 0, a ZeroDivisionError will be raised.
"""
def term_frequency(term: str, document: str) -> int:
    """
    Return the number of times a term occurs within
    a given document.

    Matching is case-insensitive and ignores punctuation in the document.
    Words are separated by any run of whitespace (spaces, tabs, newlines),
    so the last word of one line is never fused with the first word of the
    next line. An empty term never matches.

    @params: term, the term to search a document for, and document,
            the document to search within
    @returns: an integer representing the number of times a term is
            found within the document
    @examples:
    >>> term_frequency("to", "To be, or not to be")
    2
    >>> term_frequency("be", "To be,\\nor not to be")
    2
    >>> term_frequency("to", "")
    0
    """
    # strip all punctuation; whitespace is deliberately kept so that line
    # breaks and tabs keep acting as word separators
    document_without_punctuation = document.translate(
        str.maketrans("", "", string.punctuation)
    )
    # split() with no separator splits on any whitespace run and never
    # produces empty tokens
    words = document_without_punctuation.split()
    wanted = term.lower()  # lower-case the term once, not once per word
    return sum(1 for word in words if word.lower() == wanted)
def document_frequency(term: str, corpus: str) -> tuple[int, int]:
    """
    Calculate the number of documents in a corpus that contain a
    given term.

    Matching is case-insensitive, ignores punctuation in the corpus and
    only matches whole words: "is" does not match a document whose only
    candidate is "this". A multi-word term matches when its words appear
    consecutively in a document, regardless of the amount of whitespace
    between them. An empty (or whitespace-only) term matches no document.

    @params : term, the term to search each document for, and corpus, a
            collection of documents. Each document should be separated by
            a newline.
    @returns : the number of documents in the corpus that contain the term you
            are searching for and the number of documents in the corpus
    @examples :
    >>> document_frequency("first", "This is the first document.\\nThIs is the "
    ...                    "second document.\\nTHIS is the third document.")
    (1, 3)
    >>> document_frequency("is", "This one\\nthat is two")
    (1, 2)
    """
    corpus_without_punctuation = corpus.lower().translate(
        str.maketrans("", "", string.punctuation)
    )  # strip all punctuation and replace it with ''
    docs = corpus_without_punctuation.split("\n")

    term_words = term.lower().split()
    if not term_words:
        # nothing to search for: the term occurs in no document
        return (0, len(docs))

    # Pad both the term and each whitespace-normalised document with single
    # spaces so that a plain substring test can only succeed on word
    # boundaries (a bare `term in doc` would match "is" inside "this").
    needle = f" {' '.join(term_words)} "
    matching = sum(1 for doc in docs if needle in f" {' '.join(doc.split())} ")
    return (matching, len(docs))
def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
    """
    Return a float denoting the importance of a word.

    Without smoothing the value is log10(N/df); with smoothing it is
    1 + log10(N/(1 + df)). N is the number of documents in the corpus
    and df is the Document Frequency. The result is rounded to three
    decimal places.

    @params : df, the Document Frequency, N,
            the number of documents in the corpus and
            smoothing, if True return the idf-smooth
    @returns : log10(N/df) or 1 + log10(N/(1 + df)), rounded to 3 decimals
    @raises : ZeroDivisionError if df is 0 and smoothing is off,
            ValueError if N is 0
    @examples :
    >>> inverse_document_frequency(3, 0)
    Traceback (most recent call last):
     ...
    ValueError: log10(0) is undefined.
    >>> inverse_document_frequency(1, 3)
    0.477
    >>> inverse_document_frequency(0, 3)
    Traceback (most recent call last):
     ...
    ZeroDivisionError: df must be > 0
    >>> inverse_document_frequency(0, 3,True)
    1.477
    """
    # Guard clauses first. A zero df is only fatal for the unsmoothed
    # variant, and it is reported before an empty corpus is.
    if not smoothing and df == 0:
        raise ZeroDivisionError("df must be > 0")
    if N == 0:
        raise ValueError("log10(0) is undefined.")

    if smoothing:
        smoothed_ratio = N / (1 + df)
        return round(1 + log10(smoothed_ratio), 3)

    plain_ratio = N / df
    return round(log10(plain_ratio), 3)
def tf_idf(tf: int, idf: float) -> float:
    """
    Combine the term frequency
    and inverse document frequency functions to
    calculate the originality of a term. This
    'originality' is calculated by multiplying
    the term frequency and the inverse document
    frequency : tf-idf = TF * IDF

    @params : tf, the term frequency, and idf, the inverse document
            frequency (a float, e.g. the value 0.477 used below)
    @returns : tf * idf, rounded to three decimal places
    @examples :
    >>> tf_idf(2, 0.477)
    0.954
    """
    return round(tf * idf, 3)