1234567891011121314151617181920212223242526272829303132 |
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.model_selection import cross_val_score
- from sklearn.feature_extraction.text import CountVectorizer
- import pydotplus
- from datasets import Datasets
def main():
    """Cross-validate a random forest on bag-of-words features of ADFA-LD data.

    Loads the normal samples and the attack samples matching the Hydra-FTP
    pattern through ``Datasets``, turns them into dense token-count vectors
    with ``CountVectorizer``, runs a 10-fold cross-validation of a
    ``RandomForestClassifier`` and prints the mean of the fold scores.
    """
    # Load the ADFA-LD data: the normal set plus the attack set selected by
    # the Hydra-FTP path pattern, then concatenate samples and labels.
    normal_samples, normal_labels = Datasets.load_adfa_normal()
    attack_samples, attack_labels = Datasets.load_adfa_attack(
        r"Hydra_FTP_\d+/UAD-Hydra-FTP*"
    )
    samples = normal_samples + attack_samples
    labels = normal_labels + attack_labels

    # Bag-of-words features: fit the vocabulary and count tokens, then
    # convert the resulting matrix to a dense array.
    vectorizer = CountVectorizer()
    token_counts = vectorizer.fit_transform(samples)
    features = token_counts.toarray()

    # Random forest evaluated with cross-validation.
    #   n_estimators:      number of trees in the forest.
    #   max_depth:         maximum depth of a tree; with None, nodes are
    #                      expanded until all leaves are pure or until all
    #                      leaves contain fewer than min_samples_split samples.
    #   min_samples_split: minimum number of samples required to split an
    #                      internal node.
    #   random_state:      seed used by the random number generator.
    forest = RandomForestClassifier(
        n_estimators=10,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
    )
    fold_scores = cross_val_score(forest, features, labels, cv=10, n_jobs=-1)

    # Mean score recorded by the original author: 0.9829090909090908
    print(fold_scores.mean())
# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|