from datasets import Datasets  # project-local data helper (assumption: not a standard package); returns raw texts and labels
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
def main():
    # Load the data: x is a list of review texts, y the 0/1 sentiment labels
    x, y = Datasets.load_movie_review()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)
    # Preprocessing: build the vocabulary with a bag-of-words tokenizer.
    # Note: fitting on the full corpus x leaks test-set words into the
    # vocabulary; fitting on x_train only would be the stricter choice.
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(x)
    x_train = tokenizer.texts_to_sequences(x_train)
    x_test = tokenizer.texts_to_sequences(x_test)
    num_words = len(tokenizer.word_index)
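    # Toy illustration of what the tokenizer does (a sketch on a hypothetical
    # two-sentence corpus; indices are assigned by descending word frequency):
    #   t = tf.keras.preprocessing.text.Tokenizer()
    #   t.fit_on_texts(["a good movie", "a bad movie"])
    #   t.texts_to_sequences(["a good movie"])  # -> [[1, 3, 2]]; index 0 is reserved for padding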
    # Pad/truncate every sequence to length 200 and one-hot encode the labels
    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=200)
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=200)
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)
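    # For reference, on toy inputs (standard Keras behavior):
    #   pad_sequences([[1, 3, 2]], maxlen=5)   -> [[0, 0, 1, 3, 2]]  (pads at the front by default)
    #   to_categorical([0, 1], num_classes=2)  -> [[1., 0.], [0., 1.]]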
    # Sequential model (layers listed inline, so no separate add() calls)
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
            input_dim=num_words + 1,  # vocabulary size + 1, since index 0 is reserved for padding; without the +1 indexing errors occur
            output_dim=300,  # embedding dimension; 256 or 300 are common choices
            input_length=200,  # fixed input sequence length (matches maxlen above)
            trainable=True,  # update the embedding vectors during training
        ),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, implementation=2)),  # bidirectional LSTM; implementation=2 batches the gate computations into fewer, larger ops
        tf.keras.layers.Dropout(0.5),  # drop 50% of units to reduce overfitting
        tf.keras.layers.Dense(2, activation="softmax"),
    ])
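    # Optional sanity check: print the layer stack and parameter counts
    # (the embedding layer alone holds (num_words + 1) * 300 weights)
    model.summary()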
    # Compile the model
    model.compile(
        optimizer="adam",  # optimizer
        loss="categorical_crossentropy",  # loss function, matching the one-hot labels
        metrics=["acc"],  # track accuracy during training
    )
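    # The string shortcuts above are equivalent to passing the objects
    # explicitly, e.g. (Adam's default learning rate is 1e-3):
    #   model.compile(
    #       optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    #       loss=tf.keras.losses.CategoricalCrossentropy(),
    #       metrics=["acc"],
    #   )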
    # Train; the test split doubles as the validation set here
    history = model.fit(
        x_train, y_train,
        batch_size=32,  # samples per gradient update
        epochs=10,
        validation_data=(x_test, y_test),
    )
    # Final-epoch log from one run: loss: 0.0013 - acc: 1.0000 - val_loss: 0.5421 - val_acc: 0.7350
    # (training accuracy 1.0 vs. validation accuracy 0.735 is a clear sign of overfitting)
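    # Report the held-out score explicitly (same data as validation_data above);
    # evaluate() returns [loss, metric] for the metrics configured in compile()
    test_loss, test_acc = model.evaluate(x_test, y_test)
    print(f"test accuracy: {test_acc:.4f}")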
    # Plot training vs. validation accuracy to check for overfitting
    plt.plot(history.epoch, history.history.get("acc"), label="acc")
    plt.plot(history.epoch, history.history.get("val_acc"), label="val_acc")
    plt.legend()
    plt.show()
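    # The loss curves (sketched the same way) usually show the divergence even earlier:
    plt.plot(history.epoch, history.history.get("loss"), label="loss")
    plt.plot(history.epoch, history.history.get("val_loss"), label="val_loss")
    plt.legend()
    plt.show()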
if __name__ == "__main__":
    main()