1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
- import tensorflow as tf
- import matplotlib.pyplot as plt
- from datasets import Datasets
- from sklearn.model_selection import train_test_split
def main():
    """Train a small 1-D CNN text classifier on movie reviews and plot accuracy.

    Steps: load the dataset, hold out 30% of it, turn each text into a
    fixed-length sequence of integer word ids, train an
    Embedding -> Conv1D -> MaxPool1D -> Dropout -> Flatten -> Dense(softmax)
    model for 10 epochs, then plot training vs. validation accuracy per epoch.

    The function takes no arguments and returns nothing; its visible effects
    are the Keras training log and a matplotlib window.
    """
    # Load the data.
    # NOTE(review): assumes x is a sequence of raw text strings and y holds
    # binary class ids (0/1), as Tokenizer and to_categorical(num_classes=2)
    # below require -- confirm against Datasets.load_movie_review.
    x, y = Datasets.load_movie_review()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)

    # Every sequence is padded/truncated to this many tokens; the Embedding
    # layer's input_length must agree with it.
    max_len = 200

    # Tokenize: map each word to an integer id.
    # The vocabulary is built from the training split only, so the held-out
    # texts do not leak into preprocessing or into the embedding size.
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(x_train)
    # No oov_token is configured, so words that were not seen during fitting
    # are simply dropped from the held-out sequences.
    x_train = tokenizer.texts_to_sequences(x_train)
    x_test = tokenizer.texts_to_sequences(x_test)
    num_words = len(tokenizer.word_index)

    # Pad/truncate every sequence to max_len ids (this is padding, not
    # one-hot encoding).
    x_train = tf.keras.preprocessing.sequence.pad_sequences(
        x_train, maxlen=max_len)
    x_test = tf.keras.preprocessing.sequence.pad_sequences(
        x_test, maxlen=max_len)

    # The labels are the part that gets one-hot encoded, to match the
    # 2-unit softmax output and the categorical cross-entropy loss.
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)

    # Sequential model (layers are listed inline instead of calling add()).
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
            # Word ids start at 1 and id 0 is the padding value, so the
            # table needs vocabulary size + 1 rows.
            input_dim=num_words + 1,
            output_dim=300,        # embedding dimension
            input_length=max_len,  # fixed length of every input sequence
            trainable=True,        # embeddings are updated during training
        ),
        # Convolution layer.
        tf.keras.layers.Conv1D(
            filters=64,         # 64 convolution kernels
            kernel_size=3,      # each spanning 3 tokens
            padding='valid',    # no padding at the sequence edges
            activation="relu",
        ),
        tf.keras.layers.MaxPool1D(pool_size=2),  # pooling
        tf.keras.layers.Dropout(.25),  # drop 25% of units against overfitting
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(2, activation="softmax"),
    ])

    # Compile the model.
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["acc"],  # reported as "acc" / "val_acc" in the history
    )

    # Train. The held-out split is used as validation data.
    history = model.fit(
        x_train, y_train,
        batch_size=32,  # samples per gradient step
        epochs=10,
        validation_data=(x_test, y_test),
    )
    # Final-epoch log recorded by the original author:
    #   loss: 0.0013 - acc: 1.0000 - val_loss: 0.5421 - val_acc: 0.7350
    # (training accuracy far above validation accuracy, i.e. overfitting).
    # Those numbers predate fitting the tokenizer on the training split only.

    # Plot training vs. validation accuracy to make overfitting visible.
    plt.plot(history.epoch, history.history["acc"], label="acc")
    plt.plot(history.epoch, history.history["val_acc"], label="val_acc")
    plt.legend()
    plt.show()
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|