from datasets import Datasets  # custom helper module shipped with the accompanying code, not the Hugging Face "datasets" package
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# Feature extraction: vectorize each block of shell commands with a word set,
# i.e. mark which commands from the dictionary fdist appear in the block.
def get_feature(cmd, fdist):
    feature = []
    for block in cmd:
        v = [0] * len(fdist)
        for i in range(len(fdist)):
            if fdist[i] in block:
                v[i] += 1
        feature.append(v)
    return feature


def main():
    # Load the data (Schonlau masquerade dataset, user "User3").
    data, y, fdist = Datasets.load_Schonlau('User3')
    x = get_feature(data, fdist)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    num_words = len(x)

    # Pad the feature sequences and one-hot encode the labels.
    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=100)
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=100)
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)

    # Sequential model (layers listed inline, no need for add()).
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
            input_dim=num_words + 1,  # dictionary size + 1, otherwise it raises an error
            output_dim=128,
            input_length=100,         # length of the input sequences when it is fixed
        ),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dense(2, activation="softmax"),
    ])

    # Compile the model.
    model.compile(
        optimizer="adam",                 # optimizer
        loss="categorical_crossentropy",  # loss function
        metrics=["acc"],                  # metric to monitor: accuracy
    )

    # Train.
    history = model.fit(
        x_train, y_train,
        batch_size=32,  # samples per batch
        epochs=10,
        validation_data=(x_test, y_test),
    )
    # loss: 0.2557 - acc: 0.9143 - val_loss: 0.1812 - val_acc: 0.9556

    # Plot training vs. validation accuracy (to check for overfitting).
    plt.plot(history.epoch, history.history.get("acc"))
    plt.plot(history.epoch, history.history.get("val_acc"))
    plt.show()


if __name__ == "__main__":
    main()
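
The `Datasets.load_Schonlau` helper above comes from a custom `datasets` module; it is not part of the Hugging Face `datasets` package. If you do not have that module, the sketch below shows one way such a loader could be written. It is a minimal stand-in under stated assumptions: the raw Schonlau (SEA) command file for User3 holds one command per line (15,000 commands, the first 5,000 genuine), the remaining 10,000 commands are split into 100-command blocks labeled by a 100x50 matrix (0 = genuine, 1 = masquerade), and the file paths, label-file name, column index, and dictionary size of 100 are all placeholder choices, not values from the original code.

import numpy as np
from collections import Counter


def load_schonlau(cmd_path="Schonlau/User3",
                  label_path="Schonlau/label.txt",  # placeholder name for the 100x50 label matrix
                  user_index=2):                    # assumed 0-based column for User3
    # Read the command stream (one command per line).
    with open(cmd_path) as f:
        commands = [line.strip() for line in f if line.strip()]

    # Split into blocks of 100 commands: 150 blocks in total.
    blocks = [commands[i:i + 100] for i in range(0, len(commands), 100)]

    # The first 50 blocks (5,000 commands) are genuine; the label matrix
    # covers the remaining 100 blocks, one column per user.
    labels = np.loadtxt(label_path, dtype=int)
    y = [0] * 50 + list(labels[:, user_index])

    # Use the most frequent commands as the word-set dictionary fdist.
    freq = Counter(c for block in blocks for c in block)
    fdist = [cmd for cmd, _ in freq.most_common(100)]

    return blocks, y, fdist

With this stand-in, `data, y, fdist = load_schonlau()` would return the same three values the script expects: the command blocks, their labels, and the command dictionary used by `get_feature`.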