# test.py (1.3 KB)

  # Preprocessing and feature extraction.
# Numeric features can be used directly, but should be preprocessed first.
import numpy as np
from sklearn import preprocessing

X = np.array([[1.0, -1.0, 2.0], [2.0, 0.0, 0.0], [0.0, 1.0, -1.0]])

# Standardization. preprocessing.scale(X) is the one-call functional
# alternative; the estimator form is used here so the fitted scaler object
# stays available afterwards.
scaled = preprocessing.StandardScaler()
scaled.fit(X)
x_scaled = scaled.transform(X)
print(x_scaled)
  # Text features: extraction is essentially word tokenization, with each
# distinct word treated as a new feature.
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Equivalent example for dict records, kept for reference (not executed):
# measurements = [
#     {'city': 'Dubai', 'temperature': 33.},
#     {'city': 'London', 'temperature': 12.},
#     {'city': 'San Francisco', 'temperature': 18.},
# ]
#
# vec = DictVectorizer()  # not named `dict`, which would shadow the builtin
# m = vec.fit_transform(measurements).toarray()
# print(vec.get_feature_names_out())

cv = CountVectorizer()
data = cv.fit_transform(
    [
        "life is short,i like python python",
        "life is too long,i dislike python",
    ]
)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases that
# lack it. tolist() keeps the printed output a plain list of strings.
try:
    feature_names = cv.get_feature_names_out().tolist()
except AttributeError:
    feature_names = cv.get_feature_names()
print(feature_names)
print(data.toarray())
  # Decision tree classifier trained on the iris dataset.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydotplus

iris = load_iris()
x = iris.data
y = iris.target

dec = DecisionTreeClassifier()
dec.fit(x, y)

# out_file=None makes export_graphviz return the DOT source as a string
# instead of writing it to a file.
tree_dot = export_graphviz(dec, out_file=None)

# predict() needs one value per feature the model was fitted on. The iris
# data has four features, so the earlier two-value sample [[1, 2]] raised
# ValueError. Feature order follows iris.feature_names (sepal length,
# sepal width, petal length, petal width, in cm).
print(dec.predict([[5.1, 3.5, 1.4, 0.2]]))