{"The","Thecat","cat","catsat","sat",
"saton","on","onthe","the","themat","mat"}
{"The","Thecat","cat","catsat","Thecatsat",
"sat","saton","on","catsaton","onthe","the",
"satonthe","themat","mat","onthemat"}
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

max_length = 10
results = np.zeros(shape=(len(samples),
                          max_length,
                          max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.

print(results)
[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]
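As a quick sanity check (not part of the original listing), a one-hot row in results can be decoded back to its word by inverting token_index:

# Illustrative only: invert token_index and read the first encoded word back.
reverse_index = {i: w for w, i in token_index.items()}
first_row = results[0, 0]                       # encoding of the first word of sample 0
print(reverse_index[int(first_row.argmax())])   # -> 'The'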
import string

characters = string.printable    # all printable ASCII characters
token_index = dict(zip(range(1, len(characters) + 1), characters))
max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.keys()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1.
print(results)
[[[1. 1. 1. ... 1. 1. 1.]
  [1. 1. 1. ... 1. 1. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 1. 1. ... 1. 1. 1.]
  [0. 0. 0. ... 0. 0. 0.]]]
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=1000)                        # keep the 1,000 most common words
tokenizer.fit_on_texts(samples)                              # build the word index from the samples
sequences = tokenizer.texts_to_sequences(samples)            # strings -> lists of integer indices
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')  # multi-hot vectors
word_index = tokenizer.word_index
print('sequences:', sequences)
print(f'one_hot_results: shape={one_hot_results.shape}:\n', one_hot_results)
print(f'Found {len(word_index)} unique tokens.', 'word_index:', word_index)
sequences: [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
one_hot_results: shape=(2, 1000):
 [[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
Found 9 unique tokens. word_index: {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}
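The Tokenizer can also map the integer sequences back to (lowercased, punctuation-stripped) text via its learned word index; a short round-trip check, illustrative only:

# Round-trip check (not part of the original listing).
decoded = tokenizer.sequences_to_texts(sequences)
print(decoded)   # e.g. ['the cat sat on the mat', 'the dog ate my homework']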
dimensionality = 1000   # store the words as vectors of size 1,000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
print(results.shape)
print(results)

(2, 10, 1000)
[[[0. 0. 0. ... 0. 0. 0.]
 [[0. 0. 0. ... 0. 0. 0.]
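The freshly allocated tensor above is still all zeros: the loop that actually fills it is not shown in this excerpt. A minimal sketch of that fill step, assuming the same samples list and the hashing-trick variant of one-hot encoding (each word is hashed into a fixed-size index instead of being stored in an explicit dictionary):

# Sketch only: hash each word into a slot between 0 and dimensionality - 1.
# Collisions between words are possible but unlikely while the number of
# distinct words stays far below dimensionality.
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.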
from tensorflow.keras.layers import Embedding
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing

max_features = 10000   # number of words to consider as features
maxlen = 20            # cut the reviews after this many words

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
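pad_sequences crops every review down to maxlen integers and left-pads shorter ones with zeros; a quick, standalone illustration of that default pre-padding / pre-truncating behaviour (the toy sequences are made up):

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy example: shorter sequences are pre-padded with 0, longer ones pre-truncated.
print(pad_sequences([[1, 2, 3], [1, 2, 3, 4, 5, 6]], maxlen=4))
# [[0 1 2 3]
#  [3 4 5 6]]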
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen))   # (samples, maxlen, 8)
model.add(Flatten())                                   # (samples, maxlen * 8)
model.add(Dense(1, activation='sigmoid'))              # top classifier
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)
Model:"sequential_1"
_________________________________________________________________
Layer(type)OutputShapeParam#
=================================================================
embedding_2(Embedding)(None,20,8)80000
flatten_1(Flatten)(None,160)0
dense_1(Dense)(None,1)161
Totalparams:80,161
Trainableparams:80,161
Non-trainableparams:0
Epoch1/10
625/625[==============================]-1s1ms/step-loss:0.6686-acc:0.6145-val_loss:0.6152-val_acc:0.6952
Epoch10/10
625/625[==============================]-1s886us/step-loss:0.3017-acc:0.8766-val_loss:0.5260-val_acc:0.7508
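Because the padded IMDB test split (x_test, y_test) was already prepared above, the trained model can also be scored on it as an optional sanity check; this step is not part of the original run:

# Optional check: accuracy on the held-out IMDB test split loaded earlier.
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print(f'test acc: {test_acc:.3f}')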
aclImdb/ (directory layout of the uncompressed raw IMDB dataset)
import os

imdb_dir = '/Volumes/WD/Files/dataset/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

texts = []
labels = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            with open(os.path.join(dir_name, fname)) as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

print(labels[0], texts[0], sep='-->')
print(labels[-1], texts[-1], sep='-->')
print(len(texts), len(labels))
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 100                # cut reviews after 100 words
training_samples = 200      # train on only 200 samples
validation_samples = 10000  # validate on 10,000 samples
max_words = 10000           # consider only the 10,000 most common words

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle the data, since the samples were read in order (all negative, then all positive).
indices = np.arange(labels.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
Found 88582 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)
glove_dir = '/Volumes/WD/Files/glove.6B'

embeddings_index = {}
with open(os.path.join(glove_dir, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]                                  # the word itself
        coefs = np.asarray(values[1:], dtype='float32')   # its 100-d vector
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

Found 400000 word vectors.
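As a small, illustrative check (not part of the original listing) that the parsed GloVe vectors behave sensibly, related words should have a higher cosine similarity than unrelated ones; the cosine helper below is ad hoc:

# Illustrative only: compare a few of the parsed 100-d GloVe vectors.
def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

print(embeddings_index['cat'].shape)                              # (100,)
print(cosine(embeddings_index['cat'], embeddings_index['dog']))   # relatively high
print(cosine(embeddings_index['cat'], embeddings_index['mat']))   # noticeably lower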
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words absent from GloVe stay all-zeros.
            embedding_matrix[i] = embedding_vector

print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.          0.        ]
 [-0.038194   -0.24487001  0.72812003 ... -0.1459      0.82779998  0.27061999]
 [-0.071953    0.23127     0.023731   ... -0.71894997  0.86894     0.19539   ]
 [-0.44036001  0.31821999  0.10778    ... -1.29849994  0.11824     0.64845002]
 [ 0.          0.          0.         ...  0.          0.          0.        ]
 [-0.54539001 -0.31817999 -0.016281   ... -0.44865     0.067047    0.17975999]]

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_3 (Embedding)      (None, 100, 100)          1000000
flatten_2 (Flatten)          (None, 10000)             0
dense_2 (Dense)              (None, 32)                320032
dense_3 (Dense)              (None, 1)                 33
=================================================================
Total params: 1,320,065
Trainable params: 1,320,065

# Load the pretrained GloVe matrix into the Embedding layer and freeze it.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'bo-', label='Training acc')
plt.plot(epochs, val_acc, 'rs-', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo-', label='Training loss')
plt.plot(epochs, val_loss, 'rs-', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()