import os
import re
import tarfile
import urllib.request

import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, SimpleRNN
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

# Download the IMDB dataset if it is not already present
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath = "data/aclImdb_v1.tar.gz"
os.makedirs("data", exist_ok=True)
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', result)

# Extract the archive
if not os.path.exists("data/aclImdb"):
    with tarfile.open(filepath, 'r:gz') as tfile:
        tfile.extractall('data/')

# Matches HTML tags such as <br />, which the raw reviews contain
re_tag = re.compile(r'<[^>]+>')


def rm_tags(text):
    return re_tag.sub('', text)

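# For illustration (hypothetical input): rm_tags('great <br /><br />movie')
# returns 'great movie' -- tags are stripped before tokenization.
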
def read_files(filetype):
    """Read the pos/neg reviews for the given split ('train' or 'test')."""
    path = "data/aclImdb/"

    positive_path = path + filetype + "/pos/"
    positive_files = [positive_path + f for f in os.listdir(positive_path)]

    negative_path = path + filetype + "/neg/"
    negative_files = [negative_path + f for f in os.listdir(negative_path)]

    file_list = positive_files + negative_files
    print('read', filetype, 'files:', len(file_list))

    # 1 = positive, 0 = negative; derive the counts from the directories
    # rather than hard-coding 12,500 of each
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]

    return all_labels, all_texts

# Read the training and test sets
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")
y_train = np.array(y_train)
y_test = np.array(y_test)

# Build a dictionary mapping the 3,800 most frequent words to integers
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

# Map each review's words to those integers
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

# Pad or truncate every review to exactly 380 tokens
x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test = sequence.pad_sequences(x_test_seq, maxlen=380)

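# Quick sanity check of the text -> sequence -> padded-array pipeline.
# The integer ids depend on the fitted vocabulary, so the printed values
# will vary; the padded shape should always be (1, 380).
demo_seq = token.texts_to_sequences(['this movie was great'])
print(demo_seq)
print(sequence.pad_sequences(demo_seq, maxlen=380).shape)  # (1, 380)
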
model = Sequential()

# Embed each of the 3,800 vocabulary indices into a 32-dimensional vector
model.add(Embedding(output_dim=32,
                    input_dim=3800,
                    input_length=380))
model.add(Dropout(0.35))

# A single SimpleRNN layer with 16 units
model.add(SimpleRNN(units=16))

model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))

# Sigmoid output for binary (positive/negative) classification
model.add(Dense(units=1, activation='sigmoid'))

model.summary()
'''
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_1 (Embedding)      (None, 380, 32)           121600
_________________________________________________________________
dropout_1 (Dropout)          (None, 380, 32)           0
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257
=================================================================
Total params: 126,993
Trainable params: 126,993
Non-trainable params: 0
_________________________________________________________________
'''

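# Where the parameter counts above come from:
#   embedding:  3800 words x 32 dims                          = 121,600
#   simple_rnn: (32 inputs + 16 recurrent + 1 bias) x 16 units =     784
#   dense_1:    (16 inputs + 1 bias) x 256 units               =   4,352
#   dense_2:    (256 inputs + 1 bias) x 1 unit                 =     257
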
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Hold out 20% of the training data for validation
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)

def show_train_history(train_history, train, validation):
    """Plot a training metric against its validation counterpart."""
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.savefig(train + '.png')  # one file per metric, instead of overwriting '1.png'
    plt.show()

show_train_history(train_history, 'accuracy', 'val_accuracy')
show_train_history(train_history, 'loss', 'val_loss')

scores = model.evaluate(x_test, y_test, verbose=1)
print(scores[1])  # 0.8435199856758118

probability = model.predict(x_test)
print(probability[:10])
'''
[[0.99944454]
 [0.9945456 ]
 [0.9609897 ]
 [0.9960341 ]
 [0.94589317]
 [0.99465704]
 [0.9959715 ]
 [0.77324533]
 [0.99609864]
 [0.99585694]]
'''

# predict_classes was removed from recent versions of Keras;
# threshold the sigmoid output instead
predict = (model.predict(x_test) > 0.5).astype('int32')
print(predict[:10])
'''
[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]]
'''

SentimentDict = {1: 'positive', 0: 'negative'}


def display_test_sentiment(i):
    print(test_text[i])
    print('label:', SentimentDict[y_test[i]],
          'prediction:', SentimentDict[predict[i][0]])

display_test_sentiment(2)
'''
label: positive prediction: positive
'''

def predict_review(input_text):
    # Convert the review to an integer sequence
    input_seq = token.texts_to_sequences([input_text])
    # Pad or truncate so the input length is 380
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen=380)
    # Predict the class by thresholding the sigmoid output
    # (predict_classes is gone from recent Keras)
    predict_result = (model.predict(pad_input_seq) > 0.5).astype('int32')
    # Print the predicted sentiment
    print(SentimentDict[predict_result[0][0]])

predict_review('''
As a fan of the original Disney film (Personally I feel it’s their masterpiece) I was taken aback to the fact that a new version was in the making. Still excited I had high hopes for the film. Most of was shattered in the first 10 minutes. Campy acting with badly performed singing starts off a long journey holding hands with some of the worst CGI Hollywood have managed to but to screen in ages.
A film that is over 50% GCI, should focus on making that part believable, unfortunately for this film, it’s far from that. It looks like the original film was ripped apart frame by frame and the beautiful hand-painted drawings have been replaced with digital caricatures. Besides CGI that is bad, it’s mostly creepy. As the little teacup boy will give me nightmares for several nights to come. Emma Watson plays the same character as she always does, with very little acting effort and very little conviction as Belle. Although I can see why she was cast in the film based on merits, she is far from the right choice for the role. Dan Stevens does alright under as some motion captured dead-eyed Beast, but his performance feels flat as well. Luke Evans makes for a great pompous Gaston, but a character that has little depth doesn’t really make for a great viewing experience. Josh Gad is a great comic relief just like the original movie’s LeFou. Other than that, none of the cast stands out enough for me to remember them. Human or CHI creature. I was just bored through out the whole experience. And for a project costing $160 000 000, I can see why the PR department is pushing it so hard because they really need to get some cash back on this pile of wet stinky CGI-fur!
All and all, I might be bias from really loving Disney’s first adaptation. That for me marks the high-point of all their work, perfectly combining the skills of their animators along with some CGI in a majestic blend. This film however is more like the bucket you wash off your paintbrush in, it has all the same colors, but muddled with water and to thin to make a captivating story from. The film is quite frankly not worth your time, you would be better off watching the original one more time.
''')