# Johann Mitloehner, 2018
import gzip
from nltk import word_tokenize
import numpy as np
import sys
def readglove():
    """Read GloVe word embeddings from 'glove.txt.gz'.

    Each line of the file holds a word followed by its embedding
    components, whitespace-separated. Returns a dict mapping each
    word to its embedding vector (list of floats).
    """
    print('reading glove..')
    glove = {}
    # 'rt' decodes to str so dict keys are strings, not bytes;
    # the with-block closes the file handle the original leaked
    with gzip.open('glove.txt.gz', 'rt') as f:
        for line in f:
            wds = line.strip().split()
            glove[wds[0]] = [float(x) for x in wds[1:]]
    print('done.')
    return glove
# retrieve word embedding, or return 0s if not found
def emb(glove, w):
    """Look up w in the glove dict; on a miss, return a zero vector
    of the embedding dimensionality (taken from the entry for 'the')."""
    try:
        return glove[w]
    except KeyError:
        return [0.0] * len(glove['the'])
def embtoks(glove, toks):
    """Return the mean GloVe embedding of the tokens found in glove.

    Result is always a 1-D array of the embedding dimensionality:
    the average of the embeddings of the known tokens, or a zero
    vector when none of the tokens is in glove.
    """
    e = [glove[t] for t in toks if t in glove]
    if not e:
        # no known token: zero vector of the embedding width
        return np.zeros(len(glove['the']))
    # sum over tokens then divide by the count -> mean vector;
    # the original divided the 2-D (n_tokens x dim) array by n
    # without summing, returning a matrix instead of a vector
    return np.asarray(e).sum(axis=0) / len(e)
# read tweets from file and create numeric X, y training data
def embed(filename):
    """Read tab-separated tweets from filename and build training data.

    Each usable line has three tab-separated fields: an id, a
    sentiment label ('negative'/'neutral'/'positive'), and the tweet
    text. Returns (X, y): X the mean GloVe embedding per tweet,
    y the integer-encoded sentiment labels.
    """
    glove = readglove()
    X = []
    y = []
    sents = {'negative': 0, 'neutral': 1, 'positive': 2}
    print("embedding..")
    dim = len(glove['the'])  # embedding width
    with open(filename) as f:
        for line in f:
            wds = line.strip().split("\t")
            if len(wds) != 3:
                continue  # skip malformed lines
            sent = wds[1]
            tweet = wds[2].strip('"')
            e = np.zeros(dim)
            n = 0
            for w in (t.lower() for t in word_tokenize(tweet)):
                if w in glove:
                    e += glove[w]
                    n += 1
            # true mean of the found embeddings (the original started
            # the count at 1 and so divided by found+1); max(n, 1)
            # keeps the all-unknown case a zero vector
            X += [e / max(n, 1)]  # embedded tweet
            y += [sents[sent]]    # encoded sentiment
            if len(X) <= 10:      # check a few lines
                print('tweet:', line[:60])
                print('embedding:', X[-1][:5], '... sentiment:', y[-1])
    print("done.")
    return np.asarray(X), np.asarray(y)
# Build the (X, y) dataset once at import time from the local tweets file.
x, y = embed('tweets.txt')
# Start with a two-layer feed-forward net.
# First layer: dense with ReLU activation.
# Second layer: dense with softmax.
# return class with highest prob
def softpred(x):
    """For each row of probabilities, pick the index of the maximum."""
    preds = []
    for row in x:
        preds.append(np.argmax(row))
    return preds
def netpred(X, W, W2):
    """Forward pass: ReLU hidden layer, then row-wise softmax output.

    Returns (probs, hidden): the class probabilities per row and the
    post-ReLU hidden activations (kept for backprop).
    """
    hidden = np.maximum(X.dot(W), 0)
    exp_scores = np.exp(hidden.dot(W2))
    probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    return probs, hidden
# Little helper function for nice printing of accuracy:
# % correct classifications
def accur(scor, y):
    """Format the classification accuracy of scores scor against
    labels y as a percentage string with one decimal place."""
    correct = sum(softpred(scor) == y)
    pct = (100.0 * correct) / len(y)
    return '%.1f' % pct
# introduce hidden layer, SGD
def netsoftmax(dataset, h, stepsize=0.5, steps=50):
    """Train a two-layer softmax classifier with minibatch SGD.

    dataset: tuple (X, y) of inputs and integer class labels.
    h: number of hidden ReLU units.
    stepsize: learning rate; steps: number of minibatch updates.
    Returns the trained weight matrices (W, W2).
    """
    X_, y_ = dataset
    reg = 0.001          # L2 regularization strength
    D = len(X_[0])       # input dimensionality
    K = len(set(y_))     # number of classes
    print('netsoftmax: number of classes =', K)
    # small random initialization
    W = 0.1 * np.random.randn(D, h)
    W2 = 0.1 * np.random.randn(h, K)
    for step in range(steps):
        # minibatch of at most 200 examples, sampled without replacement
        ix = np.random.choice(len(y_), min(200, len(y_)), replace=False)
        y = y_[ix]
        X = X_[ix]
        probs, hid = netpred(X, W, W2)
        if (step % 1000) == 0:
            print('accuracy on training set:', accur(probs, y))
        # gradient of the cross-entropy loss w.r.t. the scores
        # (softmax minus one-hot target, averaged over the batch)
        dscor = probs
        dscor[range(len(y)), y] -= 1
        dscor /= len(y)
        # backprop through the second layer and the ReLU
        dW2 = np.dot(hid.T, dscor)
        dhid = np.dot(dscor, W2.T)
        dhid[hid <= 0] = 0  # ReLU gradient: zero where unit was inactive
        dW = np.dot(X.T, dhid)
        # add the regularization gradient
        dW += reg * W
        dW2 += reg * W2
        # parameter update in the negative gradient direction to decrease loss
        W += -stepsize * dW
        W2 += -stepsize * dW2
    probs, hid = netpred(X_, W, W2)
    print('accuracy on training set: ', accur(probs, y_))
    return W, W2
# read embeddings and tweets, train sentiment
def mainsoftmax():
    """Train the hand-rolled two-layer net on the module-level (x, y)
    data with an 80/20 train/validation split, and report accuracy."""
    n = int(len(x) * 0.8)  # 80% train, 20% validation
    x_train, y_train, x_val, y_val = x[:n], y[:n], x[n:], y[n:]
    print("size of training set:", len(x_train))
    print('training netsoftmax with ReLU units in layer 1:')
    W, W2 = netsoftmax((x_train, y_train), 200, steps=10000, stepsize=0.1)
    probs, hid = netpred(x_val, W, W2)
    print('accuracy on validation set: ', accur(probs, y_val))
# Fix the RNG seed for reproducible weight init and minibatch sampling,
# then run the hand-rolled training pipeline.
np.random.seed(1337)
mainsoftmax()
# --- Keras version of the same sentiment classifier ---
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop, SGD
import sys
import numpy as np

np.random.seed(1337)  # reproducible weight initialization

batch_size = 128
epochs = 20
num_classes = len(set(y))

# same 80/20 split as mainsoftmax above
n = int(len(x) * 0.8)
x_train, y_train, x_val, y_val = x[:n], y[:n], x[n:], y[n:]
print('training data sample:')
for i in range(5):
    print(x_train[i][:3], '... ', y_train[i])
print('size of training set: ', x_train.shape[0])
print('size of validation set:', x_val.shape[0])

# convert class vectors to binary class matrices (one-hot targets)
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)

# three hidden ReLU layers with dropout, softmax output;
# the input width follows the actual embedding dimensionality
# instead of the hard-coded 200 of the original
model = Sequential()
model.add(Dense(200, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

print('train model..')
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val))
score = model.evaluate(x_val, y_val, verbose=0)
print('accuracy on validation set:', score[1])