Commit 33459f27 authored by Natanael Copa's avatar Natanael Copa

use iterator functions to import logfiles

Rather than passing a table containing the entire parsed logfile, we pass
an iterator function.

This way we don't need to hold the entire logfile in memory at the same
time, and we can handle extremely big logfiles without consuming lots of
memory.
parent 12abd7f8
......@@ -180,7 +180,7 @@ end
local importsquidlog = function(logentries, sourcename)
con:execute("START TRANSACTION")
for i,entry in pairs(logentries) do
for entry in logentries do
local sql = string.format("INSERT INTO weblog VALUES ('%s', '%s', '%s', '%s', '%s', '%s')",
escape(sourcename), escape(entry.clientip), escape(entry.clientuserid:lower()),
escape(entry.logdatetime), escape(entry.URL), escape(entry.bytes))
......@@ -191,7 +191,7 @@ end
local importdglog = function(logentries, sourcename)
con:execute("START TRANSACTION")
for i,entry in pairs(logentries) do
for entry in logentries do
local sql = string.format("INSERT INTO blocklog VALUES ('%s', '0.0.0.0', '%s', '%s', '%s', '%s', '%s', '%s', '%s')",
escape(sourcename), escape(entry.clientuserid:lower()), escape(entry.logdatetime), escape(entry.URL),
escape(entry.bytes), escape(entry.reason), escape(entry.score or "0"), escape(entry.shortreason))
......@@ -552,39 +552,71 @@ end
-- ################################################################################
-- LOG FILE FUNCTIONS
local parsesquidlog = function(f)
local logentries = {}
for line in f:lines() do
-- Format of squid log (space separated):
-- time elapsed remotehost code/status bytes method URL rfc931 peerstatus/peerhost
local words = {}
for word in string.gmatch(line, "%S+") do
words[#words+1] = word
end
local logentry = {logdatetime=words[1], elapsed=words[2], clientip=words[3], code=string.match(words[4], "^[^/]*"), status=string.match(words[4], "[^/]*$"), bytes=words[5], method=words[6], URL=words[7], clientuserid=words[8], peerstatus=string.match(words[9], "^[^/]*"), peerhost=string.match(words[9], "[^/]*$")}
logentry.logdatetime = os.date("%Y-%m-%d %H:%M:%S", logentry.logdatetime)..string.match(logentry.logdatetime, "%..*")
-- Don't care about local requests (from DG)
if logentry.clientip ~= "127.0.0.1" then
logentries[#logentries+1] = logentry
--- Parse a single line of a squid access log into a log entry table.
-- Format of squid log (space separated):
-- time elapsed remotehost code/status bytes method URL rfc931 peerstatus/peerhost
-- @param line one log line (string)
-- @return table with string fields logdatetime, elapsed, clientip, code,
--   status, bytes, method, URL, clientuserid, peerstatus and peerhost.
--   logdatetime is the first field rendered as "YYYY-MM-DD HH:MM:SS" in
--   local time, followed by the original fractional part (if any).
-- Raises an error when the line has fewer than nine fields or when the
-- timestamp field is not numeric.
local function parsesquidlog_line(line)
	local words = {}
	for word in string.gmatch(line, "%S+") do
		words[#words+1] = word
	end
	-- Fields 4 and 9 are split further below, so fail early with a readable
	-- message instead of a "bad argument to 'match'" error.
	if #words < 9 then
		error("malformed squid log line (expected at least 9 fields): " .. line)
	end
	local seconds = tonumber(words[1])
	if seconds == nil then
		error("malformed squid log timestamp: " .. words[1])
	end
	local logentry = {logdatetime=words[1],
		elapsed=words[2],
		clientip=words[3],
		code=string.match(words[4], "^[^/]*"),
		status=string.match(words[4], "[^/]*$"),
		bytes=words[5],
		method=words[6],
		URL=words[7],
		clientuserid=words[8],
		peerstatus=string.match(words[9], "^[^/]*"),
		peerhost=string.match(words[9], "[^/]*$")}
	-- os.date needs a whole number of seconds: convert explicitly and drop
	-- the fraction rather than relying on implicit string coercion.
	-- The fraction is re-attached verbatim; a timestamp without one gets
	-- nothing appended.
	local fraction = string.match(words[1], "%..*") or ""
	logentry.logdatetime = os.date("%Y-%m-%d %H:%M:%S", math.floor(seconds))..fraction
	return logentry
end
--- Build an iterator over the entries of a squid access log.
-- Lines are read from the handle one at a time, so the whole log is never
-- held in memory at once.
-- @param f open log handle; only f:read("*line") is used
-- @return function that returns the next parsed log entry on each call and
--   nil once the handle is exhausted
local function parsesquidlog_iter(f)
	return function()
		while true do
			-- 'local' keeps the current line from leaking into the globals
			local line = f:read("*line")
			if line == nil then
				return nil
			end
			local logentry = parsesquidlog_line(line)
			-- Don't care about local requests (from DG)
			if logentry.clientip ~= "127.0.0.1" then
				return logentry
			end
		end
	end
end
local parsedglog = function(f)
local logentries = {}
for line in f:lines() do
local words = format.string_to_table(line, "\t")
local logentry = {logdatetime=words[1], clientuserid=words[2], clientip=words[3], URL=words[4], reason=words[5], method=words[6], bytes=words[7], shortreason=words[9]}
if logentry.reason ~= "" then
if logentry.shortreason == "" then logentry.shortreason = logentry.reason end
logentry.score = string.match(logentry.reason, "^.*: ([0-9]+) ")
logentry.logdatetime = string.gsub(logentry.logdatetime, "%.", "-")
--- Split one tab-separated log line into a log entry table.
-- The fields are taken by position from format.string_to_table(line, "\t");
-- position 8 is not used.
-- @param line one log line
-- @return table with fields logdatetime, clientuserid, clientip, URL,
--   reason, method, bytes and shortreason
local function parsedglog_line(line)
	local fields = format.string_to_table(line, "\t")
	local logentry = {}
	logentry.logdatetime = fields[1]
	logentry.clientuserid = fields[2]
	logentry.clientip = fields[3]
	logentry.URL = fields[4]
	logentry.reason = fields[5]
	logentry.method = fields[6]
	logentry.bytes = fields[7]
	logentry.shortreason = fields[9]
	return logentry
end
--- Build an iterator over the entries of a log read via parsedglog_line.
-- Lines are read from the handle one at a time, so the whole log is never
-- held in memory at once. Only entries with a non-empty reason are
-- returned; for those, shortreason falls back to reason when empty, score
-- is extracted from the reason text, and the dots in logdatetime are
-- replaced by dashes.
-- @param f open log handle; only f:read("*line") is used
-- @return function that returns the next matching log entry on each call
--   and nil once the handle is exhausted
local function parsedglog_iter(f)
	return function()
		while true do
			-- 'local' keeps the current line from leaking into the globals
			local line = f:read("*line")
			if line == nil then
				return nil
			end
			local logentry = parsedglog_line(line)
			-- A missing reason (short or blank line) is treated like an
			-- empty one and skipped instead of crashing in string.match.
			if logentry.reason ~= nil and logentry.reason ~= "" then
				if logentry.shortreason == "" then
					logentry.shortreason = logentry.reason
				end
				-- score is the number after the last ": " that is followed
				-- by a space; nil when the reason has no such number
				logentry.score = string.match(logentry.reason, "^.*: ([0-9]+) ")
				-- parentheses drop gsub's second result (the match count)
				logentry.logdatetime = (string.gsub(logentry.logdatetime, "%.", "-"))
				return logentry
			end
		end
	end
end
-- ################################################################################
......@@ -846,11 +878,12 @@ end
-- import either squid or dg log file.
-- delete logfile after
function importlogfile(source, cookiesfile, file, parselog_func, importlog_func)
function importlogfile(source, cookiesfile, file, parselog_iter, importlog_func)
local logentries
logme("Processing " .. file )
logme("Getting " .. file )
loghandle = openlogfile(source, cookiesfile, file)
logentries = parselog_func(loghandle)
logentries = parselog_iter(loghandle)
importlog_func(logentries, source.sourcename)
loghandle:close()
logme("Deleting " .. file )
......@@ -876,10 +909,10 @@ function importlogs()
for j,file in ipairs(files) do
if string.match(file, "dansguardian/access%.log[%.%-]") then
count = count + 1
importlogfile(source, cookeisfile, file, parsedglog, importdglog)
importlogfile(source, cookeisfile, file, parsedglog_iter, importdglog)
elseif string.match(file, "squid/access%.log[%.%-]") then
count = count + 1
importlogfile(source, cookeisfile, file, parsesquidlog, importsquidlog)
importlogfile(source, cookeisfile, file, parsesquidlog_iter, importsquidlog)
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment