Posted by
| Nick Gammon
Australia (23,158 posts) Bio
Forum Administrator |
Message
| Below is some Lua code (Lua 5.1, but could be adapted to Lua 5.0 with minor changes) that demonstrates analyzing a text string (or file) to see if it is "spam".
The general technique here is to build up a "corpus" which is a dictionary of words that you have previously decided are spam or not spam.
In the code below I generally refer to the words being "red" or "black" as the general idea can be applied to any sets of words that can be divided into groups, for example:
- spam / not spam
- profanity, or not
- English or French
- Technical writing / everyday speech
You process a file (or batch of text) and indicate whether this particular file (or string) is "red" or "black". Each word in it is then counted in the corpus under that group, and the word's probability is derived from how many times it has occurred in each group.
For example, the word "and" or "the" might occur in either group, but "make", "more" and "money" might happen more often in the spam group.
Once the corpus has been seeded, you can then supply any text and have it analyzed to see if the combined probability of every word leads us to believe it is spam or not.
First, the code:
--[[
Routine to demonstrate dividing sentences into red/black groups
(eg. spam, not spam)
Based on "A plan for Spam" by Paul Graham:
http://www.paulgraham.com/spam.html
Some C excerpts based on publicly released code by Craig Morrison.
http://sourceforge.net/users/craigbayes/
Also see "The Spam-Filtering Accuracy Plateau at 99.9% Accuracy and
How to Get Past It." by William S. Yerazunis, PhD
http://crm114.sourceforge.net/Plateau_Paper.pdf
Author: Nick Gammon
Date: 16th. September 2006
--]]
-- red/black analyzer in Lua
-- Pattern that captures a single word: a run of alphanumeric characters.
local word_regexp = "([%w]+)"

-- The corpus itself: maps each word to a table of the form
-- { red = <count>, black = <count> }.
local corpus = {}
-- Read a corpus file into the in-memory corpus.
--
-- Each usable line starts with "word, red, black"; anything after the
-- black count (such as a trailing probability column) is ignored.
-- Lines that do not match are skipped.  An entry read from the file
-- replaces any existing entry for the same word.
--
--   name: path of the corpus file to read
--
-- Raises an error (from io.lines) if the file cannot be opened.
function ReadCorpus (name)
  -- build the line pattern once rather than on every iteration
  local line_pattern = word_regexp .. ",%s+(%d+),%s+(%d+)"

  for line in io.lines (name) do
    local word, red, black = string.match (line, line_pattern)
    if word then
      -- captures are strings; store real numbers so later arithmetic
      -- and formatting do not depend on implicit string coercion
      corpus [word] = { red = tonumber (red), black = tonumber (black) }
    end -- corpus line
  end -- read loop
end -- ReadCorpus
-- Save the corpus to a file, one word per line, in the form
--   word, red, black, probability
-- where probability is CalcProbability (red, black) to 3 decimals.
--
--   name: path of the file to (over)write
--
-- Raises an error if the file cannot be opened for writing.
--
-- The file is written through its own handle, so the process-wide
-- default output file (io.output) is left untouched.
function WriteCorpus (name)
  local f = assert (io.open (name, "w"))

  for word, counts in pairs (corpus) do
    f:write (string.format ("%s, %d, %d, %1.3f\n",
             word, counts.red, counts.black,
             CalcProbability (counts.red, counts.black)))
  end -- writing all

  f:close () -- close that file now
end -- WriteCorpus
-- Fold every word found in a string into the corpus.
--
--   s:     text to split into words (using word_regexp)
--   red:   amount added to the word's red count per occurrence
--   black: amount added to the word's black count per occurrence
--
-- A word that occurs several times in s is counted once per occurrence.
function AddToCorpus (s, red, black)
  for word in string.gmatch (s, word_regexp) do
    local entry = corpus [word]
    if not entry then
      -- first sighting: start the counts at the supplied amounts
      corpus [word] = { red = red, black = black }
    else
      -- known word: bump both counts
      entry.red = entry.red + red
      entry.black = entry.black + black
    end -- if new word or not
  end -- for each word
end -- AddToCorpus
-- weightings used by CalcProbability
local C1, C2 = 2, 1
local weight, MAX_WEIGHT = 1, 2.0

-- Calculate the probability that one word is black, given how often
-- it has been seen in each group.
--
--   red:   number of red sightings
--   black: number of black sightings
--
-- Returns 0.5 when the counts are equal, more than 0.5 when black
-- sightings outnumber red ones and less than 0.5 when red ones
-- outnumber black.
function CalcProbability (red, black)
  local numerator = (black - red) * weight
  local denominator = C1 * (black + red + C2) * MAX_WEIGHT
  return 0.5 + numerator / denominator
end -- CalcProbability
-- Load a named file into the corpus.
--
--   name:  path of the text file to read
--   red:   amount added to the red count for each word occurrence
--   black: amount added to the black count for each word occurrence
--
-- Raises an error if the file cannot be opened.
--
-- The file is read through its own handle, so the process-wide
-- default input file (io.input) is left untouched.
function LoadFile (name, red, black)
  local f = assert (io.open (name, "r"))
  local s = f:read ("*a") -- whole file as one string
  f:close ()
  AddToCorpus (s, red, black)
end -- LoadFile
-- Load a file of red (spam) text: every word occurrence in it adds
-- one to that word's red count and nothing to its black count.
function LoadRed (name)
  local red, black = 1, 0
  LoadFile (name, red, black)
end -- LoadRed
-- Load a file of black (ham) text: every word occurrence in it adds
-- one to that word's black count and nothing to its red count.
function LoadBlack (name)
  local red, black = 0, 1
  LoadFile (name, red, black)
end -- LoadBlack
-- See:
-- http://www.paulgraham.com/naivebayes.html
-- For a good explanation of the background, see:
-- http://www.mathpages.com/home/kmath267.htm.

-- Calculate the probability a bunch of words are ham (black).
--
--   probs: array (1..n) of per-word probabilities
--   count: optional; use only the first `count` entries of probs
--          (defaults to all of them, and is clamped to #probs)
--
-- Returns  P / (P + Q)  where P is the product of the probabilities
-- used and Q is the product of their complements.  With no entries
-- to combine the result is the neutral value 0.5.
function SetProbability (probs, count)
  -- how many leading entries to combine
  local limit = #probs
  if count and count < limit then
    limit = count
  end -- caller asked for fewer

  -- walk the array by index: the order of pairs () is unspecified, so
  -- it cannot be relied on to yield the first `limit` entries
  local n, inv = 1, 1
  for i = 1, limit do
    local p = probs [i]
    n = n * p
    inv = inv * (1 - p)
  end -- for each probability

  local total = n + inv
  if total == 0 then
    -- both products are zero (e.g. the list holds both a 0 and a 1);
    -- 0/0 would be NaN, so report "no information" instead
    return 0.5
  end -- nothing to divide by

  return n / total
end -- SetProbability
-- Analyze a string and return the combined probability that it is
-- black (ham); a low result therefore suggests red (spam).
--
--   s: the text to analyze
--
-- Each distinct word counts once, however often it appears in s.
-- Words missing from the corpus are scored as a neutral 0.5.
function Analyze (s)
  -- gather the distinct words of the string as the keys of a set
  local seen = {}
  for word in string.gmatch (s, word_regexp) do
    seen [word] = true
  end -- for each word

  -- score every distinct word
  local scores = {}
  for word in pairs (seen) do
    local entry = corpus [word]
    local p = 0.5 -- default if not in corpus
    if entry then
      p = CalcProbability (entry.red, entry.black)
    end -- if in corpus
    scores [#scores + 1] = p
  end -- for each distinct word

  -- order the scores so that those furthest from the neutral 0.5
  -- (eg. 0.1 or 0.9) come first
  local function more_interesting (a, b)
    return math.abs (0.5 - a) > math.abs (0.5 - b)
  end -- more_interesting
  table.sort (scores, more_interesting)

  -- every score is combined; to use only the 30 most interesting:
  -- return SetProbability (scores, math.min (#scores, 30))
  return SetProbability (scores)
end -- Analyze
-- Analyze a file and print how likely its contents are to be ham.
--
--   name: path of the text file to analyze
--
-- Raises an error if the file cannot be opened.
--
-- The file is read through its own handle, so the process-wide
-- default input file (io.input) is left untouched.
function AnalyzeFile (name)
  local f = assert (io.open (name, "r"))
  local s = f:read ("*a") -- whole file as one string
  f:close ()

  print (string.format ("File %s is %2.3f %% likely to be ham",
         name, Analyze (s) * 100))
end -- AnalyzeFile
|
- Nick Gammon
www.gammon.com.au, www.mushclient.com | Top |
|