实现one hot encode的两种方法:
- 利用pandas实现one hot encode:
# transform a given column into one hot. Use prefix to have multiple dummies
>>> import pandas as pd
>>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['b', 'a', 'c']})
>>> # Get one hot encoding of columns B
...
>>> df
   A  B
0  a  b
1  b  a
2  c  c
>>> one_hot = pd.get_dummies(df['B'])
>>> # Drop columns B as it is now encoded
...
>>> df = df.drop('B', axis=1)
>>> # Join the encoded df
...
>>> df = df.join(one_hot)
>>> df
   A  a  b  c
0  a  0  1  0
1  b  1  0  0
2  c  0  0  1
import pandas as pd


def one_hot(df, cols):
    """Append one-hot (dummy) columns for each column named in *cols*.

    The original columns are kept; for every ``col`` in ``cols`` the
    indicator columns produced by ``pd.get_dummies`` are appended, named
    ``"<col>_<value>"``.

    Args:
        df: pandas DataFrame to encode. It is not modified.
        cols: Iterable of column labels in ``df`` to encode.

    Returns:
        A DataFrame made of ``df``'s columns followed by the dummy columns
        of each requested column, in the order given by ``cols``. When
        ``cols`` is empty, ``df`` itself is returned unchanged.

    Raises:
        KeyError: If a label in ``cols`` is not a column of ``df``.
    """
    encoded = [
        pd.get_dummies(df[col], prefix=col, drop_first=False) for col in cols
    ]
    if not encoded:
        return df
    # Concatenate once instead of re-copying the growing frame per column.
    return pd.concat([df, *encoded], axis=1)
- 利用sklearn实现one hot encode:
>>> from sklearn.preprocessing import OneHotEncoder
>>> enc = OneHotEncoder()
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)
>>> enc.n_values_
array([2, 3, 4])
>>> enc.feature_indices_
array([0, 2, 5, 9])
>>> enc.transform([[0, 1, 1]])
<1x9 sparse matrix of type '<class 'numpy.float64'>'
    with 3 stored elements in Compressed Sparse Row format>
>>> enc.transform([[0, 1, 1]]).toarray()
array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])
- 一个保存在全局的LabelBinarizer的demo:
from sklearn.preprocessing import LabelBinarizer

# The binarizer is fitted once at module level and reused by one_hot_encode,
# so every call maps a given label to the same output column.
# `all_your_labels_list` is a placeholder: supply the full list of labels.
label_binarizer = LabelBinarizer()
label_binarizer.fit(all_your_labels_list)


def one_hot_encode(x):
    """One-hot encode a list of sample labels.

    Args:
        x: List of sample labels; each label should be one of the labels
            the module-level ``label_binarizer`` was fitted on.

    Returns:
        Numpy array with one encoded row per label in ``x``.

    NOTE(review): per the scikit-learn docs, LabelBinarizer yields a single
    column (not two) when fitted on exactly two classes -- confirm this is
    acceptable for binary label sets.
    """
    return label_binarizer.transform(x)