UserPreferences

allnames.py


  1 
  2 
  3 
  4 
  5 
  6 
  7 
  8 
  9 
 10 
 11 
 12 
 13 
 14 
 15 
 16 
 17 
 18 
 19 
 20 
 21 
 22 
 23 
 24 
 25 
 26 
 27 
 28 
 29 
 30 
 31 
 32 
 33 
 34 
 35 
 36 
 37 
 38 
 39 
 40 
 41 
 42 
 43 
 44 
 45 
 46 
 47 
 48 
 49 
 50 
 51 
 52 
 53 
 54 
 55 
 56 
 57 
 58 
 59 
 60 
 61 
 62 
 63 
 64 
 65 
 66 
 67 
 68 
 69 
 70 
 71 
 72 
 73 
 74 
 75 
 76 
 77 
 78 
 79 
 80 
 81 
 82 
 83 
 84 
 85 
 86 
 87 
 88 
 89 
 90 
 91 
 92 
 93 
 94 
 95 
 96 
 97 
 98 
 99 
100 
101 
102 
103 
104 
105 
106 
107 
108 
109 
110 
111 
112 
113 
114 
115 
116 
117 
118 
119 
120 
121 
122 
123 

"""
This program reads training (and, optionally, test) data from
whitespace-delimited text files and trains a neural network on it.
"""

from pyrobot.brain.conx import *

def scale(vect):
    """
    Map every value of a single column onto an evenly spaced code in [0, 1].

    The distinct values in vect are sorted; the smallest is coded as 0.0,
    the largest as 1.0, and the others are spread evenly in between. If the
    column holds only one distinct value, every element is coded as 0.0.
    An empty column yields an empty list.

    Returns a new list with one float per element of vect, in the same order.
    """
    unique = sorted(set(vect))
    if not unique:
        # Nothing to scale; previously this path failed on pop() of an
        # empty list.
        return []
    # With a single distinct value the divisor is irrelevant (its code is
    # 0 * 1.0 / 1 == 0.0), so clamp it to 1 to avoid dividing by zero.
    steps = max(len(unique) - 1, 1)
    mapping = dict((val, i * 1.0 / steps) for i, val in enumerate(unique))
    return [mapping[v] for v in vect]

def transpose(table):
    """
    Return the transpose of table: rows become columns and columns rows.

    table is a list of rows; the number of columns is taken from the first
    row, so all rows are expected to be at least that long. An empty table
    yields an empty list instead of raising IndexError.
    """
    if not table:
        return []
    width = len(table[0])
    return [[row[col] for row in table] for col in range(width)]

def loadFile(filename, targets = None, inputs = None, names = None, lookup = None):
    """
    Read one whitespace-delimited data file and append its rows to the
    given containers.

    The first line of the file is treated as a header and skipped. Every
    following line is expected to look like: NAME TARGET INPUT1 INPUT2 ...
    (NAME may be wrapped in double quotes, which are stripped). The number
    of fields on the first data line defines the expected field count; lines
    with a different count (including blank lines) are reported and skipped,
    as are lines whose name has already been loaded. A line whose inputs
    match an earlier line only produces a warning and is still loaded.

    Parameters:
        filename - path of the file to read
        targets  - list to which each row's target field is appended
        inputs   - list to which each row's list of input fields is appended
        names    - list to which each row's name is appended
        lookup   - dict mapping a tuple of input fields to a comma-terminated
                   string of the names that share those inputs
    Any container left as None is created fresh for this call. Containers
    that are passed in are modified in place, which is how a second file is
    appended to the data of a first.

    Returns the tuple (targets, inputs, names, lookup, count), where count is
    the number of rows accepted from this file.
    """
    # Create fresh containers on every call; mutable default arguments
    # would be shared between calls and leak data from one load to the next.
    if targets is None:
        targets = []
    if inputs is None:
        inputs = []
    if names is None:
        names = []
    if lookup is None:
        lookup = {}
    print("Loading data '%s'..." % filename)
    infile = open(filename, 'r')
    try:
        lines = infile.readlines()
    finally:
        # Always release the file handle, even if reading fails.
        infile.close()
    lines = lines[1:] # remove the headers
    # The first data line fixes the expected number of fields; a file with
    # only a header has no data lines at all.
    if lines:
        lineLength = len(lines[0].split())
    else:
        lineLength = 0
    seen = set(names) # constant-time duplicate-name checks
    count = 0
    n = 0 # 1-based number of the current data line (header excluded)
    for rawLine in lines:
        n += 1
        line = rawLine.split()
        if line:
            name = line[0].replace('"', '') # remove quotes if NAME has them
        else:
            name = "" # blank line: there is no name field to report
        if not line or len(line) != lineLength:
            print("ERROR: invalid data; Skipping line #%d: name: %s" % (n, name))
            continue
        if name in seen:
            print("ERROR: duplicate name; Skipping line #%d: name: %s" % (n, name))
            continue
        key = tuple(line[2:])
        if key in lookup:
            print("   WARNING: duplicate input line #%d: %s same as %s" % (n, name, lookup[key]))
        seen.add(name)
        names.append(name)
        targets.append(line[1])
        inputs.append(line[2:])
        lookup[key] = lookup.get(key, "") + name + ","
        count += 1
    print("Done with '%s'; read %d names..." % (filename, count))
    return targets, inputs, names, lookup, count

def loadData(trainfile, testfile):
    """
    Load the training file (and optional test file) and encode the data.

    Both files are read with loadFile into the same containers, training
    rows first, so that every column is scaled over the combined data and
    both sets share one encoding.

    Parameters:
        trainfile - path of the training data file
        testfile  - path of the test data file, or "" for no test data

    Returns the tuple (inputs, targets, patterns, traincount, testcount):
        inputs     - one list of scaled input values per row
        targets    - one single-element list with the scaled target per row
        patterns   - dict mapping each row's name to its scaled inputs, plus
                     "female" -> [0.0] and "male" -> [1.0]
        traincount - number of rows taken from trainfile; these come first
                     in inputs and targets
        testcount  - number of rows taken from testfile (0 if none)
    """
    # First read each CSV file. Fresh containers are passed explicitly so
    # that repeated calls never share state through loadFile's defaults.
    targets, inputs, names, lookup, traincount = loadFile(trainfile, [], [], [], {})
    testcount = 0
    if testfile != "":
        targets, inputs, names, lookup, testcount = loadFile(testfile, targets, inputs, names, lookup)
    # Next, make the scaled codes for each column:
    targets = [[val] for val in scale(targets)]
    inputs = transpose([scale(col) for col in transpose(inputs)])
    # Next, make the patterns (names and inputs are parallel lists):
    patterns = dict(zip(names, inputs))
    # Patterns for output:
    patterns["female"] = [0.0]
    patterns["male"] = [1.0]
    print("Done loading all data!")
    # scale() codes the sorted distinct labels from 0.0 to 1.0, so these
    # counts assume the target column holds both "female" and "male"
    # ("female" sorts first, hence 0.0).
    maleCount = sum(1 for x in targets if x[0] == 1)
    femaleCount = sum(1 for x in targets if x[0] == 0)
    print("Males: %d Females: %d" % (maleCount, femaleCount))
    return (inputs, targets, patterns, traincount, testcount)

##############################################################################
# Here is where you will make all of your changes
##############################################################################
# The names of the files (use "" if no testfile):
trainfile = "allnames.csv"
testfile  = "testnames.csv"
# Load the data:
inputs, targets, patterns, traincount, testcount = loadData(trainfile, testfile)
# loadData returns the training rows first, followed by the test rows, so
# split the combined lists at traincount.
inputs, testInputs   = inputs[:traincount], inputs[traincount:]
targets, testTargets = targets[:traincount], targets[traincount:]
##############################################################################
# The rest is the network code:
print "Building network..."
net = Network()
net.setAutoCrossValidation(1)  # turns auto cross validation on
net.addLayers(len(inputs[0]), 16, len(targets[0]))   # input, hidden, and output layer sizes
# 16 in the line above is the size of (number of units in) the hidden layer
net.setInputs(inputs)
net.setTargets(targets)
# Parameters:
net.tolerance   = 0.4  # within this amount to be considered correct
net.epsilon     = 0.1  # learning rate
net.momentum    = 0.9  # momentum
net.reportRate  = 1    # how often to report
net.stopPercent = 0.85 # percentage to get correct
net.useCrossValidationToStop = 1 # 1 = yes, use CV to stop; 0 = use normal TSS
##############################################################################
print "Training..."
# NOTE(review): the argument presumably limits the number of training
# sweeps -- confirm against the conx Network.train documentation.
net.train(10)       # provide a number to test that number of times
##############################################################################
# All Done!, let's test:
# NOTE(review): presumably learning = 0 stops weight updates so the sweeps
# below only evaluate the network -- confirm against conx.
net.learning = 0
net.interactive = 1
# Register the name -> inputs patterns (plus "female"/"male") built by loadData.
net.setPatterns(patterns)
net.sweep()

# Now, let's test the test data:
# (interactive was already set to 1 above; this assignment repeats it.)
net.interactive = 1
# NOTE(review): when testfile is "", testInputs and testTargets are empty
# lists -- verify that sweeping an empty data set is acceptable.
net.setInputs(testInputs)
net.setTargets(testTargets)
net.sweep()