#!/usr/bin/env python
"""Grid search over log2(C) and log2(gamma) for svm-train.

Every (C, gamma) pair of the grid is evaluated by running the svm-train
executable in cross-validation mode (``-v``) and parsing the "Cross ..."
line it prints.  Jobs are run by worker threads (local, ssh or telnet) and
the results are optionally appended to an output file and plotted as a
contour with gnuplot.

The public entry point is :func:`find_parameters`.
"""

__all__ = ['find_parameters']

import getpass
import os
import re
import sys
import time
import traceback
from subprocess import *  # kept as in the original so module attributes stay available
from subprocess import PIPE, Popen
from threading import Thread

if sys.version_info[0] < 3:
    from Queue import Empty, Queue
else:
    from queue import Empty, Queue

# Hosts on which remote workers are started, and the number of local workers.
telnet_workers = []
ssh_workers = []
nr_local_worker = 1

# Options of GridOption.parse_options that must be followed by a value.
_VALUE_OPTIONS = ('-log2c', '-log2g', '-v', '-svmtrain', '-gnuplot', '-out', '-png')

# Seconds to wait for a result before checking that a worker is still alive.
_RESULT_POLL_SECONDS = 0.5


def _parse_log2_range(flag, text):
    """Parse a ``begin,end,step`` string into three floats.

    Raises ValueError when the text is malformed, when ``step`` is zero
    (the grid would never terminate) or when ``step`` points away from
    ``end`` (the grid would be empty).
    """
    try:
        begin, end, step = map(float, text.split(','))
    except ValueError:
        raise ValueError(
            '{0} expects begin,end,step or "null", got "{1}"'.format(flag, text))
    if step == 0:
        raise ValueError('{0}: step must not be zero'.format(flag))
    if (end - begin) * step < 0:
        raise ValueError(
            '{0}: step {1} never reaches end {2} from begin {3}'.format(
                flag, step, end, begin))
    return begin, end, step


class GridOption:
    """Settings of one grid search, built from defaults plus an option list."""

    def __init__(self, dataset_pathname, options):
        """Set the defaults for `dataset_pathname`, then apply `options`.

        `options` is either a whitespace separated string or a sequence of
        tokens; see parse_options for the exceptions that can be raised.
        """
        dirname = os.path.dirname(__file__)
        if sys.platform != 'win32':
            self.svmtrain_pathname = os.path.join(dirname, 'libsvm-weights-3.20/svm-train')
            self.gnuplot_pathname = '/usr/bin/gnuplot'
        else:
            # example for windows
            self.svmtrain_pathname = os.path.join(dirname, r'libsvm-weights-3.20\windows\svm-train.exe')
            # svmtrain_pathname = r'c:\Program Files\libsvm\windows\svm-train.exe'
            self.gnuplot_pathname = r'c:\tmp\gnuplot\binary\pgnuplot.exe'
        self.fold = 5
        self.c_begin, self.c_end, self.c_step = -5, 15, 2
        self.g_begin, self.g_end, self.g_step = 3, -15, -2
        self.grid_with_c, self.grid_with_g = True, True
        self.dataset_pathname = dataset_pathname
        self.dataset_title = os.path.split(dataset_pathname)[1]
        self.out_pathname = '{0}.out'.format(self.dataset_title)
        self.png_pathname = '{0}.png'.format(self.dataset_title)
        self.pass_through_string = ' '
        self.resume_pathname = None
        self.parse_options(options)

    def parse_options(self, options):
        """Apply grid options; unknown tokens are passed through to svm-train.

        Raises:
            ValueError: an option lacks its value, a range is malformed,
                ``-c``/``-g`` is used, or both ranges are "null".
            IOError: svm-train, the dataset or the resume file is missing.
        """
        if isinstance(options, str):
            options = options.split()
        i = 0
        pass_through_options = []

        while i < len(options):
            opt = options[i]
            if opt in _VALUE_OPTIONS:
                # A trailing option without a value used to raise IndexError.
                if i + 1 >= len(options):
                    raise ValueError('option {0} requires an argument'.format(opt))
                i = i + 1
                value = options[i]
                if opt == '-log2c':
                    if value == 'null':
                        self.grid_with_c = False
                    else:
                        self.c_begin, self.c_end, self.c_step = _parse_log2_range(opt, value)
                elif opt == '-log2g':
                    if value == 'null':
                        self.grid_with_g = False
                    else:
                        self.g_begin, self.g_end, self.g_step = _parse_log2_range(opt, value)
                elif opt == '-v':
                    self.fold = value
                elif opt == '-svmtrain':
                    self.svmtrain_pathname = value
                elif opt == '-gnuplot':
                    self.gnuplot_pathname = None if value == 'null' else value
                elif opt == '-out':
                    self.out_pathname = None if value == 'null' else value
                else:  # '-png'
                    self.png_pathname = value
            elif opt in ('-c', '-g'):
                raise ValueError('Use -log2c and -log2g.')
            elif opt == '-resume':
                # The pathname is optional: it is absent when -resume is the
                # last token or is directly followed by another option.
                if i == (len(options) - 1) or options[i + 1].startswith('-'):
                    self.resume_pathname = self.dataset_title + '.out'
                else:
                    i = i + 1
                    self.resume_pathname = options[i]
            else:
                pass_through_options.append(opt)
            i = i + 1

        self.pass_through_string = ' '.join(pass_through_options)
        if not os.path.exists(self.svmtrain_pathname):
            raise IOError('svm-train executable not found')
        if not os.path.exists(self.dataset_pathname):
            raise IOError('dataset not found')
        if self.resume_pathname and not os.path.exists(self.resume_pathname):
            raise IOError('file for resumption not found')
        if not self.grid_with_c and not self.grid_with_g:
            raise ValueError('-log2c and -log2g should not be null simultaneously')
        if self.gnuplot_pathname and not os.path.exists(self.gnuplot_pathname):
            # Plotting is optional: warn and carry on without it.
            sys.stderr.write('gnuplot executable not found\n')
            self.gnuplot_pathname = None


def redraw(db, best_param, gnuplot, options, tofile=False):
    """Send gnuplot the commands that draw the contour of the results in `db`.

    `db` is a list of (log2c, log2g, rate) tuples and is sorted in place.
    `best_param` is (best_log2c, best_log2g, best_rate).  `gnuplot` is a
    writable binary stream (the stdin of a gnuplot process).  With `tofile`
    the plot goes to ``options.png_pathname`` instead of the screen.
    Nothing is drawn while `db` is empty or while all values of any one of
    the three columns are equal.
    """
    if not db:
        return
    begin_level = round(max(x[2] for x in db)) - 3
    step_size = 0.5

    best_log2c, best_log2g, best_rate = best_param

    # if newly obtained c, g, or cv values are the same,
    # then stop redrawing the contour.
    for column in range(3):
        if all(x[column] == db[0][column] for x in db):
            return

    def send(command):
        gnuplot.write(command.encode())

    if tofile:
        send("set term png transparent small linewidth 2 medium enhanced\n")
        send("set output \"{0}\"\n".format(options.png_pathname.replace('\\', '\\\\')))
        # (a PostScript terminal could be selected here instead of png)
    elif sys.platform == 'win32':
        send("set term windows\n")
    else:
        send("set term x11\n")
    send("set xlabel \"log2(C)\"\n")
    send("set ylabel \"log2(gamma)\"\n")
    send("set xrange [{0}:{1}]\n".format(options.c_begin, options.c_end))
    send("set yrange [{0}:{1}]\n".format(options.g_begin, options.g_end))
    send("set contour\n")
    send("set cntrparam levels incremental {0},{1},100\n".format(begin_level, step_size))
    send("unset surface\n")
    send("unset ztics\n")
    send("set view 0,0\n")
    send("set title \"{0}\"\n".format(options.dataset_title))
    send("unset label\n")
    send("set label \"Best log2(C) = {0} log2(gamma) = {1} accuracy = {2}%\""
         " at screen 0.5,0.85 center\n".format(best_log2c, best_log2g, best_rate))
    send("set label \"C = {0} gamma = {1}\""
         " at screen 0.5,0.8 center\n".format(2**best_log2c, 2**best_log2g))
    send("set key at screen 0.9,0.9\n")
    send("splot \"-\" with lines\n")

    db.sort(key=lambda x: (x[0], -x[1]))

    # A blank line separates the data of two successive values of c.
    prevc = db[0][0]
    for line in db:
        if prevc != line[0]:
            send("\n")
            prevc = line[0]
        send("{0[0]} {0[1]} {0[2]}\n".format(line))
    send("e\n")
    send("\n")  # force gnuplot back to prompt when term set failure
    gnuplot.flush()


def calculate_jobs(options):
    """Return ``(jobs, resumed_jobs)`` for the grid described by `options`.

    `jobs` is a list of lines, each a list of (log2c, log2g) pairs, ordered
    so that the grid is refined progressively.  `resumed_jobs` maps
    (log2c, log2g) to the rate read from ``options.resume_pathname``; it is
    empty when no resume file is configured.
    """

    def range_f(begin, end, step):
        # like range, but works on non-integer too
        seq = []
        while True:
            if step > 0 and begin > end:
                break
            if step < 0 and begin < end:
                break
            seq.append(begin)
            begin = begin + step
        return seq

    def permute_sequence(seq):
        # Middle element first, then the permuted halves interleaved.
        n = len(seq)
        if n <= 1:
            return seq

        mid = int(n / 2)
        left = permute_sequence(seq[:mid])
        right = permute_sequence(seq[mid + 1:])

        ret = [seq[mid]]
        while left or right:
            if left:
                ret.append(left.pop(0))
            if right:
                ret.append(right.pop(0))

        return ret

    c_seq = permute_sequence(range_f(options.c_begin, options.c_end, options.c_step))
    g_seq = permute_sequence(range_f(options.g_begin, options.g_end, options.g_step))

    if not options.grid_with_c:
        c_seq = [None]
    if not options.grid_with_g:
        g_seq = [None]

    # Floats, so that the ratios below are true divisions on Python 2 too.
    nr_c = float(len(c_seq))
    nr_g = float(len(g_seq))
    i, j = 0, 0
    jobs = []

    while i < nr_c or j < nr_g:
        if i / nr_c < j / nr_g:
            # increase C resolution
            jobs.append([(c_seq[i], g_seq[k]) for k in range(j)])
            i = i + 1
        else:
            # increase g resolution
            jobs.append([(c_seq[k], g_seq[j]) for k in range(i)])
            j = j + 1

    resumed_jobs = {}

    if options.resume_pathname is None:
        return jobs, resumed_jobs

    with open(options.resume_pathname, 'r') as resume_file:
        for line in resume_file:
            line = line.strip()
            rst = re.findall(r'rate=([0-9.]+)', line)
            if not rst:
                continue
            rate = float(rst[0])

            c, g = None, None
            rst = re.findall(r'log2c=([0-9.-]+)', line)
            if rst:
                c = float(rst[0])
            rst = re.findall(r'log2g=([0-9.-]+)', line)
            if rst:
                g = float(rst[0])

            resumed_jobs[(c, g)] = rate

    return jobs, resumed_jobs


def _to_bytes(text):
    """Return `text` as bytes (telnetlib needs bytes on Python 3)."""
    if isinstance(text, bytes):
        return text
    return text.encode('utf-8')


def _parse_cv_rate(output):
    """Return the number ending the first line containing 'Cross', or None.

    `output` may be bytes or text.  A trailing '%' on the number is dropped.
    """
    if isinstance(output, bytes):
        output = output.decode('utf-8', 'replace')
    for line in output.split('\n'):
        if 'Cross' in line:
            return float(line.split()[-1].rstrip('%'))
    return None


def _run_shell_and_parse(cmdline):
    """Run `cmdline` through the shell and parse the rate from its stdout.

    communicate() drains stdout and stderr together, so a child that writes
    a lot to stderr cannot block on a full pipe, and the child is reaped.
    """
    process = Popen(cmdline, shell=True, stdout=PIPE, stderr=PIPE, stdin=PIPE)
    output, _ = process.communicate()
    return _parse_cv_rate(output)


def _next_result(result_queue, workers):
    """Return the next result, or raise RuntimeError if no worker is left.

    A worker thread quits after a failed job, so waiting without a timeout
    would block forever once every worker has quit.
    """
    while True:
        try:
            return result_queue.get(timeout=_RESULT_POLL_SECONDS)
        except Empty:
            if any(worker.is_alive() for worker in workers):
                continue
            # A result may have been queued just before the last worker ended.
            try:
                return result_queue.get_nowait()
            except Empty:
                raise RuntimeError('all workers have quit before the grid was finished')


class WorkerStopToken:  # used to notify the worker to stop or if a worker is dead
    pass


class Worker(Thread):
    """Thread that takes (log2c, log2g) jobs from a queue and evaluates them.

    Subclasses implement ``run_one(c, g)``, which returns the rate or None.
    """

    def __init__(self, name, job_queue, result_queue, options):
        Thread.__init__(self)
        self.name = name
        self.job_queue = job_queue
        self.result_queue = result_queue
        self.options = options

    def run(self):
        """Process jobs until the stop token is seen or a job fails."""
        while True:
            (cexp, gexp) = self.job_queue.get()
            if cexp is WorkerStopToken:
                # Put the token back so that the other workers see it too.
                self.job_queue.put((cexp, gexp))
                break
            try:
                c, g = None, None
                if cexp is not None:
                    c = 2.0**cexp
                if gexp is not None:
                    g = 2.0**gexp
                rate = self.run_one(c, g)
                if rate is None:
                    raise RuntimeError('get no rate')
            except Exception:
                # we failed, let others do that and we just quit
                traceback.print_exc()
                self.job_queue.put((cexp, gexp))
                sys.stderr.write('worker {0} quit.\n'.format(self.name))
                break
            else:
                self.result_queue.put((self.name, cexp, gexp, rate))

    def run_one(self, c, g):
        """Evaluate one (C, gamma) pair; implemented by the subclasses."""
        raise NotImplementedError

    def get_cmd(self, c, g):
        """Return the svm-train command line for the given C and gamma."""
        options = self.options
        cmdline = '"' + options.svmtrain_pathname + '"'
        if options.grid_with_c:
            cmdline += ' -c {0} '.format(c)
        if options.grid_with_g:
            cmdline += ' -g {0} '.format(g)
        cmdline += ' -v {0} {1} {2} '.format(
            options.fold, options.pass_through_string, options.dataset_pathname)
        return cmdline


class LocalWorker(Worker):
    """Worker that runs svm-train on the local machine."""

    def run_one(self, c, g):
        return _run_shell_and_parse(self.get_cmd(c, g))


class SSHWorker(Worker):
    """Worker that runs svm-train on `host` through the ssh command."""

    def __init__(self, name, job_queue, result_queue, host, options):
        Worker.__init__(self, name, job_queue, result_queue, options)
        self.host = host
        self.cwd = os.getcwd()

    def run_one(self, c, g):
        cmdline = 'ssh -x -t -t {0} "cd {1}; {2}"'.format(
            self.host, self.cwd, self.get_cmd(c, g))
        return _run_shell_and_parse(cmdline)


class TelnetWorker(Worker):
    """Worker that runs svm-train on `host` over a telnet session."""

    def __init__(self, name, job_queue, result_queue, host, username, password, options):
        Worker.__init__(self, name, job_queue, result_queue, options)
        self.host = host
        self.username = username
        self.password = password

    def run(self):
        """Log in, change to the current directory, process jobs, log out."""
        import telnetlib
        self.tn = tn = telnetlib.Telnet(self.host)
        # telnetlib works on bytes on Python 3, hence the explicit encoding.
        tn.read_until(b'login: ')
        tn.write(_to_bytes(self.username + '\n'))
        tn.read_until(b'Password: ')
        tn.write(_to_bytes(self.password + '\n'))

        # XXX: how to know whether login is successful?
        tn.read_until(_to_bytes(self.username))

        tn.write(_to_bytes('cd ' + os.getcwd() + '\n'))
        Worker.run(self)
        tn.write(b'exit\n')

    def run_one(self, c, g):
        cmdline = self.get_cmd(c, g)
        self.tn.write(_to_bytes(cmdline + '\n'))
        (idx, matchm, output) = self.tn.expect([b'Cross.*\n'])
        return _parse_cv_rate(output)


def find_parameters(dataset_pathname, options=''):
    """Run the grid search and return ``(best_rate, best_param)``.

    `options` is a string or a sequence of tokens understood by GridOption.
    `best_param` is a dict with the keys 'c' and/or 'g' holding the best C
    and gamma (not their logarithms).  Progress lines are printed to stdout.

    Raises:
        ValueError, IOError: invalid options or missing files.
        RuntimeError: every worker has quit before the grid was finished.
    """

    def update_param(c, g, rate, best_c, best_g, best_rate, worker, resumed):
        # Better rate wins; on a tie with the same g the smaller c wins.
        if (rate > best_rate) or (rate == best_rate and g == best_g and c < best_c):
            best_rate, best_c, best_g = rate, c, g
        stdout_str = '[{0}] {1} {2} (best '.format(
            worker, ' '.join(str(x) for x in [c, g] if x is not None), rate)
        output_str = ''
        if c is not None:
            stdout_str += 'c={0}, '.format(2.0**best_c)
            output_str += 'log2c={0} '.format(c)
        if g is not None:
            stdout_str += 'g={0}, '.format(2.0**best_g)
            output_str += 'log2g={0} '.format(g)
        stdout_str += 'rate={0})'.format(best_rate)
        print(stdout_str)
        if options.out_pathname and not resumed:
            output_str += 'rate={0}\n'.format(rate)
            result_file.write(output_str)
            result_file.flush()

        return best_c, best_g, best_rate

    options = GridOption(dataset_pathname, options)

    if options.gnuplot_pathname:
        gnuplot = Popen(options.gnuplot_pathname, stdin=PIPE, stdout=PIPE, stderr=PIPE).stdin
    else:
        gnuplot = None

    # put jobs in queue

    jobs, resumed_jobs = calculate_jobs(options)
    job_queue = Queue(0)
    result_queue = Queue(0)

    for (c, g) in resumed_jobs:
        result_queue.put(('resumed', c, g, resumed_jobs[(c, g)]))

    for line in jobs:
        for (c, g) in line:
            if (c, g) not in resumed_jobs:
                job_queue.put((c, g))

    # hack the queue to become a stack --
    # this is important when some thread
    # failed and re-put a job. If we still
    # use FIFO, the job will be put
    # into the end of the queue, and the graph
    # will only be updated in the end

    job_queue._put = job_queue.queue.appendleft

    workers = []

    # telnet workers
    if telnet_workers:
        username = getpass.getuser()
        password = getpass.getpass()
        for host in telnet_workers:
            workers.append(TelnetWorker(host, job_queue, result_queue,
                                        host, username, password, options))

    # ssh workers
    for host in ssh_workers:
        workers.append(SSHWorker(host, job_queue, result_queue, host, options))

    # local workers
    for i in range(nr_local_worker):
        workers.append(LocalWorker('local', job_queue, result_queue, options))

    # fire all workers
    for thread in workers:
        thread.start()

    # gather results

    done_jobs = {}
    db = []
    best_rate = -1
    best_c, best_g = None, None
    result_file = None

    try:
        if options.out_pathname:
            mode = 'a' if options.resume_pathname else 'w'
            result_file = open(options.out_pathname, mode)

        for (c, g) in resumed_jobs:
            rate = resumed_jobs[(c, g)]
            best_c, best_g, best_rate = update_param(
                c, g, rate, best_c, best_g, best_rate, 'resumed', True)

        for line in jobs:
            for (c, g) in line:
                # Results arrive in any order; record them until ours is in.
                while (c, g) not in done_jobs:
                    (worker, c1, g1, rate1) = _next_result(result_queue, workers)
                    done_jobs[(c1, g1)] = rate1
                    if (c1, g1) not in resumed_jobs:
                        best_c, best_g, best_rate = update_param(
                            c1, g1, rate1, best_c, best_g, best_rate, worker, False)
                db.append((c, g, done_jobs[(c, g)]))
            if gnuplot and options.grid_with_c and options.grid_with_g:
                redraw(db, [best_c, best_g, best_rate], gnuplot, options)
                redraw(db, [best_c, best_g, best_rate], gnuplot, options, True)
    finally:
        # Also on errors: close the output and tell the workers to stop.
        if result_file is not None:
            result_file.close()
        job_queue.put((WorkerStopToken, None))

    best_param, best_cg = {}, []
    if best_c is not None:
        best_param['c'] = 2.0**best_c
        best_cg += [2.0**best_c]
    if best_g is not None:
        best_param['g'] = 2.0**best_g
        best_cg += [2.0**best_g]
    print('{0} {1}'.format(' '.join(map(str, best_cg)), best_rate))

    return best_rate, best_param


if __name__ == '__main__':

    def exit_with_help():
        print("""\
Usage: grid.py [grid_options] [svm_options] dataset

grid_options :
-log2c {begin,end,step | "null"} : set the range of c (default -5,15,2)
    begin,end,step -- c_range = 2^{begin,...,begin+k*step,...,end}
    "null"         -- do not grid with c
-log2g {begin,end,step | "null"} : set the range of g (default 3,-15,-2)
    begin,end,step -- g_range = 2^{begin,...,begin+k*step,...,end}
    "null"         -- do not grid with g
-v n : n-fold cross validation (default 5)
-svmtrain pathname : set svm executable path and name
-gnuplot {pathname | "null"} :
    pathname -- set gnuplot executable path and name
    "null"   -- do not plot
-out {pathname | "null"} : (default dataset.out)
    pathname -- set output file path and name
    "null"   -- do not output file
-png pathname : set graphic output file path and name (default dataset.png)
-resume [pathname] : resume the grid task using an existing output file (default pathname is dataset.out)
    This is experimental. Try this option only if some parameters have been checked for the SAME data.

svm_options : additional options for svm-train""")
        sys.exit(1)

    if len(sys.argv) < 2:
        exit_with_help()
    dataset_pathname = sys.argv[-1]
    options = sys.argv[1:-1]
    try:
        find_parameters(dataset_pathname, options)
    except (IOError, ValueError) as e:
        sys.stderr.write(str(e) + '\n')
        sys.stderr.write('Try "grid.py" for more information.\n')
        sys.exit(1)
    except RuntimeError as e:
        sys.stderr.write(str(e) + '\n')
        sys.exit(1)