501 lines
16 KiB
Python
501 lines
16 KiB
Python
#!/usr/bin/env python
|
|
__all__ = ['find_parameters']
|
|
|
|
import os, sys, traceback, getpass, time, re
|
|
from threading import Thread
|
|
from subprocess import *
|
|
|
|
if sys.version_info[0] < 3:
|
|
from Queue import Queue
|
|
else:
|
|
from queue import Queue
|
|
|
|
# Hosts to run jobs on via telnet; find_parameters() starts one TelnetWorker
# per entry and prompts for a password when this list is non-empty.
telnet_workers = []
# Hosts to run jobs on via ssh; find_parameters() starts one SSHWorker per entry.
ssh_workers = []
# Number of LocalWorker threads find_parameters() starts on this machine.
nr_local_worker = 1
|
|
|
|
class GridOption:
    """Settings for one grid search over log2(C) and log2(gamma).

    The constructor installs platform-dependent defaults and then lets
    parse_options() override them from command-line style options.

    Attributes:
      svmtrain_pathname   -- svm-train executable to run.
      gnuplot_pathname    -- gnuplot executable, or None when plotting is
                             disabled or the executable does not exist.
      fold                -- value handed to svm-train's -v option.
      c_begin, c_end, c_step / g_begin, g_end, g_step
                          -- ranges of log2(C) and log2(gamma).
      grid_with_c, grid_with_g
                          -- False when the parameter was given as "null".
      dataset_pathname, dataset_title
                          -- the dataset path and its last path component.
      out_pathname        -- result file, or None for "-out null".
      png_pathname        -- image file written by the final plot.
      resume_pathname     -- earlier result file to resume from, or None.
      pass_through_string -- unrecognised options, joined with spaces, that
                             are forwarded to svm-train.
    """

    def __init__(self, dataset_pathname, options):
        """Set the defaults for dataset_pathname, then apply options.

        options is either a list of tokens or a whitespace-separated string.
        Raises whatever parse_options() raises.
        """
        dirname = os.path.dirname(__file__)
        if sys.platform != 'win32':
            self.svmtrain_pathname = os.path.join(dirname, 'libsvm-weights-3.20/svm-train')
            self.gnuplot_pathname = '/usr/bin/gnuplot'
        else:
            # example for windows
            self.svmtrain_pathname = os.path.join(dirname, r'libsvm-weights-3.20\windows\svm-train.exe')
            # svmtrain_pathname = r'c:\Program Files\libsvm\windows\svm-train.exe'
            self.gnuplot_pathname = r'c:\tmp\gnuplot\binary\pgnuplot.exe'
        self.fold = 5
        self.c_begin, self.c_end, self.c_step = -5, 15, 2
        self.g_begin, self.g_end, self.g_step = 3, -15, -2
        self.grid_with_c, self.grid_with_g = True, True
        self.dataset_pathname = dataset_pathname
        self.dataset_title = os.path.split(dataset_pathname)[1]
        self.out_pathname = '{0}.out'.format(self.dataset_title)
        self.png_pathname = '{0}.png'.format(self.dataset_title)
        self.pass_through_string = ' '
        self.resume_pathname = None
        self.parse_options(options)

    def parse_options(self, options):
        """Apply options (a token list or a string) and validate the result.

        Tokens that are not grid options are collected, in order, into
        pass_through_string.

        Raises:
          ValueError -- for -c/-g (use -log2c/-log2g instead), for an option
                        whose argument is missing or malformed, or when both
                        -log2c and -log2g are "null".
          IOError    -- when svm-train, the dataset or the resume file does
                        not exist.

        A missing gnuplot executable is not an error: a message is written
        to stderr and plotting is disabled.
        """
        if isinstance(options, str):
            options = options.split()
        pass_through_options = []

        def argument_of(flag_index):
            # Report a missing argument as ValueError (which the script entry
            # point turns into a usage hint) instead of a bare IndexError.
            if flag_index + 1 >= len(options):
                raise ValueError(
                    'option {0} requires an argument'.format(options[flag_index]))
            return options[flag_index + 1]

        i = 0
        while i < len(options):
            flag = options[i]
            if flag == '-log2c':
                value = argument_of(i)
                i = i + 1
                if value == 'null':
                    self.grid_with_c = False
                else:
                    self.c_begin, self.c_end, self.c_step = map(float, value.split(','))
            elif flag == '-log2g':
                value = argument_of(i)
                i = i + 1
                if value == 'null':
                    self.grid_with_g = False
                else:
                    self.g_begin, self.g_end, self.g_step = map(float, value.split(','))
            elif flag == '-v':
                self.fold = argument_of(i)
                i = i + 1
            elif flag in ('-c', '-g'):
                raise ValueError('Use -log2c and -log2g.')
            elif flag == '-svmtrain':
                self.svmtrain_pathname = argument_of(i)
                i = i + 1
            elif flag == '-gnuplot':
                value = argument_of(i)
                i = i + 1
                self.gnuplot_pathname = None if value == 'null' else value
            elif flag == '-out':
                value = argument_of(i)
                i = i + 1
                self.out_pathname = None if value == 'null' else value
            elif flag == '-png':
                self.png_pathname = argument_of(i)
                i = i + 1
            elif flag == '-resume':
                # The pathname is optional: without one (end of options, or
                # another option follows) fall back to "<dataset>.out".
                if i == (len(options) - 1) or options[i + 1].startswith('-'):
                    self.resume_pathname = self.dataset_title + '.out'
                else:
                    i = i + 1
                    self.resume_pathname = options[i]
            else:
                pass_through_options.append(flag)
            i = i + 1

        self.pass_through_string = ' '.join(pass_through_options)
        if not os.path.exists(self.svmtrain_pathname):
            raise IOError('svm-train executable not found')
        if not os.path.exists(self.dataset_pathname):
            raise IOError('dataset not found')
        if self.resume_pathname and not os.path.exists(self.resume_pathname):
            raise IOError('file for resumption not found')
        if not self.grid_with_c and not self.grid_with_g:
            raise ValueError('-log2c and -log2g should not be null simultaneously')
        if self.gnuplot_pathname and not os.path.exists(self.gnuplot_pathname):
            sys.stderr.write('gnuplot executable not found\n')
            self.gnuplot_pathname = None
|
|
|
|
def redraw(db, best_param, gnuplot, options, tofile=False):
    """Send gnuplot the commands and data for a contour plot of the results.

    db         -- list of (log2c, log2g, rate) tuples gathered so far; it is
                  left unmodified.
    best_param -- sequence (best_log2c, best_log2g, best_rate) shown in the
                  plot labels.
    gnuplot    -- writable binary stream with a flush() method (the caller
                  passes the stdin pipe of a gnuplot process).
    options    -- object providing c_begin, c_end, g_begin, g_end,
                  dataset_title and png_pathname.
    tofile     -- when true, plot with the png terminal into
                  options.png_pathname; otherwise use the 'windows' terminal
                  on win32 and 'x11' elsewhere.

    Returns None.  Nothing is written when db is empty or when every row
    shares the same log2c, the same log2g or the same rate.  The whole
    command sequence is sent with a single write followed by a flush.
    """
    if not db:
        return
    begin_level = round(max(x[2] for x in db)) - 3
    step_size = 0.5

    best_log2c, best_log2g, best_rate = best_param

    # if newly obtained c, g, or cv values are the same,
    # then stop redrawing the contour.
    for column in range(3):
        if all(row[column] == db[0][column] for row in db):
            return

    commands = []
    if tofile:
        commands.append("set term png transparent small linewidth 2 medium enhanced\n")
        # Backslashes (Windows paths) must be doubled inside gnuplot's
        # double-quoted strings.
        commands.append("set output \"{0}\"\n".format(options.png_pathname.replace('\\', '\\\\')))
    elif sys.platform == 'win32':
        commands.append("set term windows\n")
    else:
        commands.append("set term x11\n")
    commands.append("set xlabel \"log2(C)\"\n")
    commands.append("set ylabel \"log2(gamma)\"\n")
    commands.append("set xrange [{0}:{1}]\n".format(options.c_begin, options.c_end))
    commands.append("set yrange [{0}:{1}]\n".format(options.g_begin, options.g_end))
    commands.append("set contour\n")
    commands.append("set cntrparam levels incremental {0},{1},100\n".format(begin_level, step_size))
    commands.append("unset surface\n")
    commands.append("unset ztics\n")
    commands.append("set view 0,0\n")
    commands.append("set title \"{0}\"\n".format(options.dataset_title))
    commands.append("unset label\n")
    commands.append(
        "set label \"Best log2(C) = {0} log2(gamma) = {1} accuracy = {2}%\" "
        "at screen 0.5,0.85 center\n".format(best_log2c, best_log2g, best_rate))
    commands.append(
        "set label \"C = {0} gamma = {1}\""
        " at screen 0.5,0.8 center\n".format(2**best_log2c, 2**best_log2g))
    commands.append("set key at screen 0.9,0.9\n")
    commands.append("splot \"-\" with lines\n")

    # Sort a copy so the caller's list keeps its order.  Rows are grouped by
    # log2c and a blank line is emitted between groups.
    rows = sorted(db, key=lambda x: (x[0], -x[1]))

    prevc = rows[0][0]
    for line in rows:
        if prevc != line[0]:
            commands.append("\n")
            prevc = line[0]
        commands.append("{0[0]} {0[1]} {0[2]}\n".format(line))
    commands.append("e\n")
    commands.append("\n")  # force gnuplot back to prompt when term set failure

    gnuplot.write(''.join(commands).encode())
    gnuplot.flush()
|
|
|
|
|
|
def calculate_jobs(options):
    """Plan the grid and load results already recorded in a resume file.

    options must provide c_begin/c_end/c_step, g_begin/g_end/g_step,
    grid_with_c, grid_with_g and resume_pathname.

    Returns (jobs, resumed_jobs):
      jobs         -- list of lists of (log2c, log2g) pairs.  Both ranges are
                      visited middle-first and the two axes are refined
                      alternately, so early batches cover the grid coarsely.
                      A parameter that is not gridded appears as None.
      resumed_jobs -- dict mapping (log2c, log2g) to the rate read from
                      options.resume_pathname; empty when that is None.

    Raises ValueError when a gridded range has a zero step or contains no
    values.
    """

    def range_f(begin, end, step):
        # like range, but works on non-integer too
        if step == 0:
            # A zero step would never reach `end`.
            raise ValueError('step of a parameter range must not be 0')
        seq = []
        while (step > 0 and begin <= end) or (step < 0 and begin >= end):
            seq.append(begin)
            begin = begin + step
        return seq

    def permute_sequence(seq):
        # Reorder seq middle element first, then alternate between the
        # (recursively reordered) left and right halves.
        n = len(seq)
        if n <= 1:
            return seq

        mid = n // 2
        left = permute_sequence(seq[:mid])
        right = permute_sequence(seq[mid + 1:])

        ret = [seq[mid]]
        while left or right:
            if left:
                ret.append(left.pop(0))
            if right:
                ret.append(right.pop(0))

        return ret

    def grid_sequence(enabled, begin, end, step, flag):
        # A parameter that is not gridded contributes the single value None.
        if not enabled:
            return [None]
        seq = permute_sequence(range_f(begin, end, step))
        if not seq:
            # An empty axis would make the interleaving below divide by zero.
            raise ValueError('{0} range is empty'.format(flag))
        return seq

    c_seq = grid_sequence(options.grid_with_c,
                          options.c_begin, options.c_end, options.c_step, '-log2c')
    g_seq = grid_sequence(options.grid_with_g,
                          options.g_begin, options.g_end, options.g_step, '-log2g')

    nr_c = float(len(c_seq))
    nr_g = float(len(g_seq))
    i, j = 0, 0
    jobs = []

    # Advance whichever axis is proportionally behind, pairing each new
    # value with every value already used on the other axis.
    while i < nr_c or j < nr_g:
        if i / nr_c < j / nr_g:
            # increase C resolution
            jobs.append([(c_seq[i], g_seq[k]) for k in range(j)])
            i = i + 1
        else:
            # increase g resolution
            jobs.append([(c_seq[k], g_seq[j]) for k in range(i)])
            j = j + 1

    resumed_jobs = {}

    if options.resume_pathname is None:
        return jobs, resumed_jobs

    with open(options.resume_pathname, 'r') as resume_file:
        for line in resume_file:
            line = line.strip()
            found = re.search(r'rate=([0-9.]+)', line)
            if not found:
                continue
            rate = float(found.group(1))

            c, g = None, None
            found = re.search(r'log2c=([0-9.-]+)', line)
            if found:
                c = float(found.group(1))
            found = re.search(r'log2g=([0-9.-]+)', line)
            if found:
                g = float(found.group(1))

            resumed_jobs[(c, g)] = rate

    return jobs, resumed_jobs
|
|
|
|
|
|
class WorkerStopToken:
    """Marker class: used to notify the worker to stop or if a worker is dead.

    It is never instantiated by this module; the class object itself is put
    on the job queue and recognised by identity.
    """
|
|
|
|
class Worker(Thread):
    """Thread that takes (log2c, log2g) jobs from a queue and evaluates them.

    Subclasses supply run_one(c, g), which must return the cross-validation
    rate for the actual parameter values, or None when no rate was obtained.
    """

    def __init__(self, name, job_queue, result_queue, options):
        """Remember the queues and options.

        name         -- label reported with every result and in messages.
        job_queue    -- queue of (log2c, log2g) pairs to evaluate.
        result_queue -- queue receiving (name, log2c, log2g, rate) tuples.
        options      -- object read by get_cmd() (svmtrain_pathname,
                        grid_with_c, grid_with_g, fold, pass_through_string,
                        dataset_pathname).
        """
        Thread.__init__(self)
        self.name = name
        self.job_queue = job_queue
        self.result_queue = result_queue
        self.options = options

    def run(self):
        """Process jobs until a stop token arrives or a job fails.

        On failure the job is put back on the queue, the traceback and a
        'worker ... quit.' message go to stderr, and this worker ends.
        """
        while True:
            (cexp, gexp) = self.job_queue.get()
            if cexp is WorkerStopToken:
                # Put the token back so every other worker sharing the queue
                # also gets to see it.
                self.job_queue.put((cexp, gexp))
                # print('worker {0} stop.'.format(self.name))
                break
            try:
                c, g = None, None
                if cexp is not None:
                    c = 2.0**cexp
                if gexp is not None:
                    g = 2.0**gexp
                rate = self.run_one(c, g)
                if rate is None:
                    raise RuntimeError('get no rate')
            except Exception:
                # we failed, let others do that and we just quit
                traceback.print_exc()

                self.job_queue.put((cexp, gexp))
                sys.stderr.write('worker {0} quit.\n'.format(self.name))
                break
            else:
                self.result_queue.put((self.name, cexp, gexp, rate))

    def get_cmd(self, c, g):
        """Return the svm-train command line for the actual values c and g.

        -c / -g are included only for the parameters being gridded; the
        fold count, pass-through options and dataset path follow.
        """
        options = self.options
        cmdline = '"' + options.svmtrain_pathname + '"'
        if options.grid_with_c:
            cmdline += ' -c {0} '.format(c)
        if options.grid_with_g:
            cmdline += ' -g {0} '.format(g)
        cmdline += ' -v {0} {1} {2} '.format(
            options.fold, options.pass_through_string, options.dataset_pathname)
        return cmdline
|
|
|
|
class LocalWorker(Worker):
    """Worker that runs svm-train as a child process on this machine."""

    def run_one(self, c, g):
        """Run one cross validation and return its rate.

        The rate is the last token, minus its final character, of the first
        output line containing 'Cross'.  Returns None when no such line is
        printed.
        """
        cmdline = self.get_cmd(c, g)
        # NOTE(review): the command is a shell string built from the options;
        # option values must come from a trusted source.
        process = Popen(cmdline, shell=True, stdout=PIPE, stderr=PIPE, stdin=PIPE)
        # communicate() closes stdin, drains stdout and stderr and waits for
        # the child, so no pipe is left open and the child is reaped.
        output, _ = process.communicate()
        for line in output.splitlines():
            if b'Cross' in line:
                return float(line.split()[-1][0:-1])
        return None
|
|
|
|
class SSHWorker(Worker):
    """Worker that runs svm-train on a remote host through ssh."""

    def __init__(self, name, job_queue, result_queue, host, options):
        """Like Worker, plus the host to log in to.

        The current working directory is recorded here; each remote command
        changes into it first.
        """
        Worker.__init__(self, name, job_queue, result_queue, options)
        self.host = host
        self.cwd = os.getcwd()

    def run_one(self, c, g):
        """Run one cross validation on the remote host and return its rate.

        The rate is the last token, minus its final character, of the first
        output line containing 'Cross'.  Returns None when no such line is
        printed.
        """
        cmdline = 'ssh -x -t -t {0} "cd {1}; {2}"'.format(
            self.host, self.cwd, self.get_cmd(c, g))
        process = Popen(cmdline, shell=True, stdout=PIPE, stderr=PIPE, stdin=PIPE)
        # communicate() closes stdin, drains stdout and stderr and waits for
        # ssh to exit, so no pipe is left open and the child is reaped.
        output, _ = process.communicate()
        for line in output.splitlines():
            if b'Cross' in line:
                return float(line.split()[-1][0:-1])
        return None
|
|
|
|
class TelnetWorker(Worker):
    """Worker that runs svm-train on a remote host over one telnet session.

    NOTE: telnetlib was removed from the standard library in Python 3.13;
    on such interpreters run() fails with ImportError.
    """

    def __init__(self, name, job_queue, result_queue, host, username, password, options):
        """Like Worker, plus the host and the credentials used to log in."""
        Worker.__init__(self, name, job_queue, result_queue, options)
        self.host = host
        self.username = username
        self.password = password

    @staticmethod
    def _to_bytes(text):
        # telnetlib sends and matches bytes; on Python 2 str already is bytes.
        if isinstance(text, bytes):
            return text
        return text.encode()

    def run(self):
        """Log in, change to the local working directory, then process jobs.

        The connection is closed when the worker ends, whether it finished
        normally or failed.
        """
        import telnetlib
        self.tn = tn = telnetlib.Telnet(self.host)
        try:
            tn.read_until(b'login: ')
            tn.write(self._to_bytes(self.username + '\n'))
            tn.read_until(b'Password: ')
            tn.write(self._to_bytes(self.password + '\n'))

            # XXX: how to know whether login is successful?
            tn.read_until(self._to_bytes(self.username))
            #
            print('login ok {0}'.format(self.host))
            tn.write(self._to_bytes('cd ' + os.getcwd() + '\n'))
            Worker.run(self)
            tn.write(b'exit\n')
        finally:
            tn.close()

    def run_one(self, c, g):
        """Send one svm-train command and return the rate it reports.

        Waits for a line containing 'Cross'; the rate is that line's last
        token minus its final character.  Returns None when the text read
        contains no such line.
        """
        cmdline = self.get_cmd(c, g)
        self.tn.write(self._to_bytes(cmdline + '\n'))
        (idx, matchm, output) = self.tn.expect([b'Cross.*\n'])
        for line in output.split(b'\n'):
            if b'Cross' in line:
                return float(line.split()[-1][0:-1])
        return None
|
|
|
|
def find_parameters(dataset_pathname, options=''):
    """Grid-search C and gamma for dataset_pathname by cross validation.

    options is a token list or a whitespace-separated string understood by
    GridOption.  Jobs are handed to the telnet, ssh and local workers
    configured at module level.  Progress is printed, each new result is
    appended to the result file (unless disabled), and the contour plot is
    redrawn after every batch when gnuplot is available and both parameters
    are gridded.

    Returns (best_rate, best_param), where best_param maps 'c' and/or 'g'
    to the best actual values (2**log2 value).

    Raises:
      IOError, ValueError -- from GridOption / calculate_jobs.
      RuntimeError        -- when every worker has quit before all jobs
                             were finished.

    The result file is closed and the workers are told to stop even when
    the search ends with an exception.
    """
    try:
        from queue import Empty
    except ImportError:  # Python 2
        from Queue import Empty

    def update_param(c, g, rate, best_c, best_g, best_rate, worker, resumed):
        # Fold one result into the running best (ties at the same g go to
        # the smaller c), print a progress line and, for results that were
        # not resumed, record it in the result file.
        if (rate > best_rate) or (rate == best_rate and g == best_g and c < best_c):
            best_rate, best_c, best_g = rate, c, g
        stdout_str = '[{0}] {1} {2} (best '.format(
            worker, ' '.join(str(x) for x in [c, g] if x is not None), rate)
        output_str = ''
        if c is not None:
            stdout_str += 'c={0}, '.format(2.0**best_c)
            output_str += 'log2c={0} '.format(c)
        if g is not None:
            stdout_str += 'g={0}, '.format(2.0**best_g)
            output_str += 'log2g={0} '.format(g)
        stdout_str += 'rate={0})'.format(best_rate)
        print(stdout_str)
        if options.out_pathname and not resumed:
            output_str += 'rate={0}\n'.format(rate)
            result_file.write(output_str)
            result_file.flush()

        return best_c, best_g, best_rate

    def next_result():
        # Wait for the next result, but give up once no worker is left that
        # could still produce one instead of blocking forever.
        while True:
            try:
                return result_queue.get(timeout=1.0)
            except Empty:
                if any(w.is_alive() for w in workers):
                    continue
                try:
                    # A worker may have delivered a result just before ending.
                    return result_queue.get_nowait()
                except Empty:
                    raise RuntimeError(
                        'all workers quit before the grid search finished')

    def launch(new_worker):
        workers.append(new_worker)
        new_worker.start()

    options = GridOption(dataset_pathname, options)

    if options.gnuplot_pathname:
        gnuplot = Popen(options.gnuplot_pathname, stdin=PIPE, stdout=PIPE, stderr=PIPE).stdin
    else:
        gnuplot = None

    # put jobs in queue

    jobs, resumed_jobs = calculate_jobs(options)
    job_queue = Queue(0)
    result_queue = Queue(0)

    for (c, g) in resumed_jobs:
        result_queue.put(('resumed', c, g, resumed_jobs[(c, g)]))

    for line in jobs:
        for (c, g) in line:
            if (c, g) not in resumed_jobs:
                job_queue.put((c, g))

    # hack the queue to become a stack --
    # this is important when some thread
    # failed and re-put a job. If we still
    # use FIFO, the job will be put
    # into the end of the queue, and the graph
    # will only be updated in the end

    job_queue._put = job_queue.queue.appendleft

    workers = []
    result_file = None
    done_jobs = {}
    db = []
    best_rate = -1
    best_c, best_g = None, None

    try:
        if options.out_pathname:
            # Append when resuming so earlier results are kept.
            mode = 'a' if options.resume_pathname else 'w'
            result_file = open(options.out_pathname, mode)

        # fire telnet workers

        if telnet_workers:
            username = getpass.getuser()
            password = getpass.getpass()
            for host in telnet_workers:
                launch(TelnetWorker(host, job_queue, result_queue,
                                    host, username, password, options))

        # fire ssh workers

        for host in ssh_workers:
            launch(SSHWorker(host, job_queue, result_queue, host, options))

        # fire local workers

        for i in range(nr_local_worker):
            launch(LocalWorker('local', job_queue, result_queue, options))

        # gather results

        for (c, g) in resumed_jobs:
            rate = resumed_jobs[(c, g)]
            best_c, best_g, best_rate = update_param(
                c, g, rate, best_c, best_g, best_rate, 'resumed', True)

        for line in jobs:
            for (c, g) in line:
                # Results arrive in any order; keep consuming until the one
                # this grid point needs has been seen.
                while (c, g) not in done_jobs:
                    (worker_name, c1, g1, rate1) = next_result()
                    done_jobs[(c1, g1)] = rate1
                    if (c1, g1) not in resumed_jobs:
                        best_c, best_g, best_rate = update_param(
                            c1, g1, rate1, best_c, best_g, best_rate, worker_name, False)
                db.append((c, g, done_jobs[(c, g)]))
            if gnuplot and options.grid_with_c and options.grid_with_g:
                redraw(db, [best_c, best_g, best_rate], gnuplot, options)
                redraw(db, [best_c, best_g, best_rate], gnuplot, options, True)
    finally:
        if result_file is not None:
            result_file.close()
        # Release the workers on every exit path; otherwise their threads
        # keep the process alive.
        job_queue.put((WorkerStopToken, None))

    best_param, best_cg = {}, []
    if best_c is not None:
        best_param['c'] = 2.0**best_c
        best_cg.append(2.0**best_c)
    if best_g is not None:
        best_param['g'] = 2.0**best_g
        best_cg.append(2.0**best_g)
    print('{0} {1}'.format(' '.join(map(str, best_cg)), best_rate))

    return best_rate, best_param
|
|
|
|
|
|
if __name__ == '__main__':

    def exit_with_help():
        """Print the usage text to stdout and exit with status 1."""
        usage = """\
Usage: grid.py [grid_options] [svm_options] dataset

grid_options :
-log2c {begin,end,step | "null"} : set the range of c (default -5,15,2)
begin,end,step -- c_range = 2^{begin,...,begin+k*step,...,end}
"null" -- do not grid with c
-log2g {begin,end,step | "null"} : set the range of g (default 3,-15,-2)
begin,end,step -- g_range = 2^{begin,...,begin+k*step,...,end}
"null" -- do not grid with g
-v n : n-fold cross validation (default 5)
-svmtrain pathname : set svm executable path and name
-gnuplot {pathname | "null"} :
pathname -- set gnuplot executable path and name
"null" -- do not plot
-out {pathname | "null"} : (default dataset.out)
pathname -- set output file path and name
"null" -- do not output file
-png pathname : set graphic output file path and name (default dataset.png)
-resume [pathname] : resume the grid task using an existing output file (default pathname is dataset.out)
This is experimental. Try this option only if some parameters have been checked for the SAME data.

svm_options : additional options for svm-train"""
        print(usage)
        sys.exit(1)

    # The dataset is the last argument; everything between the script name
    # and it is handed over as options.
    if not sys.argv[1:]:
        exit_with_help()
    options, dataset_pathname = sys.argv[1:-1], sys.argv[-1]
    try:
        find_parameters(dataset_pathname, options)
    except (IOError, ValueError) as e:
        for message in (str(e) + '\n', 'Try "grid.py" for more information.\n'):
            sys.stderr.write(message)
        sys.exit(1)
|