The main method, which I use to train the spam-filter:
- yield the catalog [from corpus import Corpus]
- every word in message, if it has "SPAM" or "HAM" classification,
add to appropriate list
- create lists with only unique words (using previous lists)
- create the spamicity / hamicity of words (dictionary
'word':-icity)
- calculate the probability of spam and ham
- using sets of dictionary keys - create the general vocabulary
Problems: train() works too long "Method train() takes too long to execute, 5.0 minutes allowed.", also if I use regexp such as: w+(?:.?w+)*@w+(?:.?w+)|w+
the test loops.
By this reg exp I try to split the message into the list of any words and email addresses if they exist
method:
def __init__(self):
self.spam = [] # trained spam words
self.ham = [] # trained ham words
self.spamicity = {} # dictionary of the spam words
self.hamicity = {} # dictionary of the ham words
self.unique_spam = []
self.unique_ham = []
self.vocabulary = {}
def train(self, direct):
""" Train filter with classified data"""
classification = read_classification_from_file(direct + "/!truth.txt")
corp = Corpus(direct)
for header, message in corp.emails():
message_split = self.reg_splitter(message)
for word in message_split:
if classification[header] == 'SPAM':
self.spam.append(word)
else:
self.ham.append(word)
# FULLY LOADED LISTS
self.unique_spam = self.create_vocabulary(self.spam)
self.unique_ham = self.create_vocabulary(self.ham)
self.spamicity = self.create_the_icity(self.unique_spam, self.spam)
self.hamicity = self.create_the_icity(self.unique_ham, self.ham)
spam = len(self.spam)
ham = len(self.ham)
prob_spam = spam / (spam + ham)
prob_ham = ham / (spam + ham)
for word in (set(self.spamicity.keys()) | set(self.hamicity.keys())):
if word in self.hamicity.keys() and word in self.spamicity.keys():
self.vocabulary[word] = self.spamicity[word] / spam * prob_spam / (
self.spamicity[word] / spam * prob_spam + self.hamicity[word] / ham * prob_ham)
elif word in self.hamicity.keys() and word not in self.spamicity.keys():
self.vocabulary[word] = self.hamicity[word]
elif word in self.spamicity.keys() and word not in self.hamicity.keys():
self.vocabulary[word] = self.spamicity[word]
additional methods (corpus and read_classification_from_file work fine):
def create_vocabulary(self, train_list):
word_dict_without_value = dict.fromkeys(train_list, 0)
unique_words = list(word_dict_without_value) # convert the dict to list will make the list of "keys"
return unique_words
def create_the_icity(self, unique_list, train_list):
dict_of_icity = {}
for word in unique_list:
emails_with_w = 0 # counter
for rec in train_list:
if word in rec: # if the word is inside that record -> counter++
emails_with_w += 1
total_of = len(train_list) # means total of spam or ham
the_icity = (emails_with_w + 1) / (total_of + 2) # for not to get 0, but 0.5
dict_of_icity[word.lower()] = the_icity
return dict_of_icity
def reg_splitter(self, message):
exp = re.compile(r"w+") #w+(?:.?w+)*@w+(?:.?w+)|
split_message = re.findall(exp, message.lower())
return split_message
question from:
https://stackoverflow.com/questions/65829405/filtering-the-message-catalog-using-bayesian-spam-filtering-loop-on-regular-ex 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…