12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
- # 预处理与特征提取
- # 数字型直接可以用,但要进行预处理
- import numpy as np
- X = np.array([
- [1., -1., 2.],
- [2., 0., 0.],
- [0., 1., -1.]]
- )
- from sklearn import preprocessing
- # 标准化预处理
- # x_scaled = preprocessing.scale(X)
- # print(x_scaled)
- scaled = preprocessing.StandardScaler()
- x_scaled = scaled.fit_transform(X)
- print(x_scaled)
- # 字符串,特征提取 本质上是做单词切分,不同的单词当作一个新的特征
- from sklearn.feature_extraction import DictVectorizer
- from sklearn.feature_extraction.text import CountVectorizer
- # measurements = [
- # {'city': 'Dubai', 'temperature': 33.},
- # {'city': 'London', 'temperature': 12.},
- # {'city': 'San Fransisco', 'temperature': 18.},
- # ]
- #
- # dict = DictVectorizer()
- # m = dict.fit_transform(measurements).toarray()
- # print(dict.get_feature_names())
- cv = CountVectorizer()
- data = cv.fit_transform(["life is short,i like python python", "life is too long,i dislike python"])
- print(cv.get_feature_names())
- print(data.toarray())
- from sklearn.datasets import load_iris
- from sklearn.tree import DecisionTreeClassifier, export_graphviz
- import pydotplus
- iris = load_iris()
- x = iris.data
- y = iris.target
- dec = DecisionTreeClassifier()
- dec.fit(x, y)
- tree_dot = export_graphviz(dec, out_file=None)
- print(dec.predict([[1, 2]]))
|