0. Recommended Libraries
1. Common Feature Engineering
1.1 Data Processing Package - Pandas
# Read files
pd.read_csv("train.csv", header=None, index_col=None)
pd.read_excel("train.xlsx", header=None, index_col=None)
df.set_index(column_name, drop=True, inplace=True)
df.sort_values(by=, axis=, ascending=, inplace=, kind=, ignore_index=)
df.sort_index(axis=, level=, ascending=)
pd.concat([data1, data2], axis=)
df.add(other=, axis=, level=, fill_value=)
df.to_numpy()
df.astype(dtype=, copy=True)
df.copy(deep=True)
df.loc[:, :]
df.insert(loc=, column=, value=, allow_duplicates=)  # insert a column
df.isin(values)  # element-wise membership test
df.apply(func, axis=, raw=, result_type=)
df.transform(func, axis=)
df.groupby(by=, axis=, level=, dropna=, as_index=, group_keys=)  # level is mainly for MultiIndex data
df.count(axis=, level=, numeric_only=)  # count non-NA values per column
df.cov(min_periods=, ddof=)  # min_periods: minimum number of observations per column pair
df.diff(periods=, axis=)  # difference with the row `periods` steps earlier
df.eval(expr=)  # evaluate an expression
df.round(decimals=)
df.value_counts(subset=, normalize=, sort=, ascending=, dropna=)
df.drop(labels=, axis=, index=, columns=, level=)
df.drop_duplicates(subset=, keep=, inplace=, ignore_index=)
df.equals(other=)
df.reset_index(level=, drop=, inplace=)
df.dropna(axis=, how=, thresh=)  # thresh: rows with at least this many non-NA values are kept
df.rank(axis=, method=, numeric_only=, na_option=, ascending=, pct=)  # method: how ties are ranked; pct: return percentile ranks
df.rename(index={}, columns={})
df.isna()
df.append(other=, ignore_index=, verify_integrity=)  # deprecated since pandas 1.4; prefer pd.concat
df.merge(right, how=, on=, left_on=, right_on=, left_index=, right_index=)  # like a SQL join
df.join(other, on=, how=)  # more concise; this one is recommended
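A minimal runnable sketch exercising a couple of the calls above; the users/orders frames are invented toy data:
import pandas as pd

users = pd.DataFrame({"uid": [1, 2, 3], "city": ["SH", "BJ", "SH"]})
orders = pd.DataFrame({"uid": [1, 1, 2], "amount": [10.0, 15.0, 7.5]})

merged = users.merge(orders, how="left", on="uid")   # SQL-style left join
per_city = merged.groupby("city")["amount"].sum()    # aggregate per group
print(per_city)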
1.2 Common Feature Methods
1.2.1 Standardization / Normalization / Scaling
Standardization: rescale samples onto a standard normal distribution using the estimated mean and variance;
Normalization: map values onto [0, 1] using the feature's minimum and maximum;
Scaling: when the data originally lie in [-N, N], they can be rescaled into [-1, 1];
# Feature scaling
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
scaler.transform(X_train)
scaler = preprocessing.MinMaxScaler()
X_train_minmax = scaler.fit_transform(X_train)
scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = scaler.fit_transform(X_train)
1.2.2 Downsampling / Oversampling
Downsampling and oversampling rebalance class-imbalanced data, usually according to a target ratio or a size threshold. For example, when the two classes of a binary dataset are in a 5:1 ratio, you can either oversample the minority class or downsample the majority class. The difference: with plenty of samples, downsampling clearly works better; with few samples, it is better to add minority samples via linear interpolation. Here, class balance is achieved by resampling.
# Sampling for two classes
import pandas as pd

def two_classes_sample(source_data, label, method, nu=100000):
    """
    source_data: the training data, DataFrame.
    label: the column name of the label, str.
    method: the sampling method, "up" or "down", str.
    nu: if the data has at least nu rows, it is not copied before sampling, int.
    """
    data = source_data
    index = source_data.index
    if len(source_data) < nu:
        from copy import deepcopy
        data = deepcopy(source_data)
    # Split into the two classes by comparing with the first row's label
    data0 = data[data[label] == data.loc[index[0], label]]
    data1 = data[data[label] != data.loc[index[0], label]]
    if method == "up":
        # Grow the smaller class by at most 80% per round until the sizes match
        while data0.shape[0] < data1.shape[0]:
            n = min(max(1, int(0.8 * data0.shape[0])), data1.shape[0] - data0.shape[0])
            data0 = pd.concat([data0, data0.sample(n=n)])
        while data1.shape[0] < data0.shape[0]:
            n = min(max(1, int(0.8 * data1.shape[0])), data0.shape[0] - data1.shape[0])
            data1 = pd.concat([data1, data1.sample(n=n)])
        return pd.concat([data0, data1])
    else:
        # Downsample the larger class to the size of the smaller one
        if data0.shape[0] < data1.shape[0]:
            return pd.concat([data0, data1.sample(n=data0.shape[0])])
        else:
            return pd.concat([data1, data0.sample(n=data1.shape[0])])
# Sampling for multiple classes
def multi_sample(source_data, label, sigma=0.8, method="UP", nu=100000):
    """
    source_data: the training data, DataFrame.
    label: the column name of the label, str.
    sigma: the maximum growth ratio per oversampling round, float.
    method: the sampling method, "UP" or "DOWN", str.
    nu: if the data has at least nu rows, it is not copied before sampling, int.
    """
    data = source_data
    if len(source_data) < nu:
        from copy import deepcopy
        data = deepcopy(source_data)
    counts = data[label].value_counts()
    df_list = [data[data[label] == x] for x in counts.index]
    min_max = [counts.values.min(), counts.values.max()]
    if method == "UP":
        # Grow every class until it reaches the size of the largest one
        for i in range(len(df_list)):
            while df_list[i].shape[0] < min_max[1]:
                n = min(int(df_list[i].shape[0] * sigma + 1),
                        min_max[1] - df_list[i].shape[0])
                df_list[i] = pd.concat([df_list[i], df_list[i].sample(n=n)])
    else:
        # Shrink every class to the size of the smallest one
        df_list = [x.sample(n=min_max[0]) for x in df_list]
    return pd.concat(df_list, axis=0)
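A minimal usage sketch of the two samplers above, on an invented imbalanced toy frame:
import pandas as pd

# Hypothetical imbalanced toy data: 50 negatives, 10 positives
df = pd.DataFrame({"Label": [0] * 50 + [1] * 10, "x": range(60)})
print(two_classes_sample(df, label="Label", method="up")["Label"].value_counts())
print(multi_sample(df, label="Label", method="DOWN")["Label"].value_counts())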
2. Complex Feature Engineering
2.1 Sample Augmentation - KNN
There is a machine learning algorithm called K-nearest neighbors (KNN). It is a supervised classification algorithm, but because it computes distances between samples, it can also serve as a way to generate new samples. How does KNN work? Given a training set and a new input instance, it finds the K instances in the training set nearest to that instance; if the majority of those K instances belong to some class, the input instance is assigned to that class.
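A minimal SMOTE-style sketch of this idea: for each minority-class sample, pick one of its K nearest minority neighbors and interpolate linearly between the two. The function knn_augment, the matrix X_min, and all parameters are illustrative assumptions, not an established API:
import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_augment(X_min, k=5, n_new=20, seed=0):
    """Generate n_new synthetic minority samples by interpolating
    between each sample and one of its k nearest neighbors."""
    rng = np.random.default_rng(seed)
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X_min)
    _, idx = nn.kneighbors(X_min)                # column 0 is the point itself
    new_rows = []
    for _ in range(n_new):
        i = rng.integers(len(X_min))             # a random minority sample
        j = rng.choice(idx[i, 1:])               # one of its k neighbors
        lam = rng.random()                       # interpolation weight in [0, 1]
        new_rows.append(X_min[i] + lam * (X_min[j] - X_min[i]))
    return np.vstack(new_rows)

# Invented toy minority-class matrix: 30 samples, 4 features
X_synthetic = knn_augment(np.random.rand(30, 4), k=5, n_new=20)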
2.2 Data Binning - One Hot
Information Gain - ID3
Information gain is the difference between the entropy and the conditional entropy. The entropy is computed as:
$$H(D) = -\sum_{k=1}^{K} p_k \log p_k$$
The conditional entropy, given a feature A that partitions D into D_1, ..., D_n, is computed as:
$$H(D \mid A) = \sum_{i=1}^{n} \frac{|D_i|}{|D|} H(D_i)$$
The resulting information gain is therefore the reduction in disorder after splitting the dataset on a given feature.
After evaluating every candidate feature, the one with the largest information gain is chosen as the split feature.
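As a toy sanity check (all numbers invented), the entropy and information gain for a 10-sample binary label split by a hypothetical feature:
import numpy as np

def entropy(labels):
    """Shannon entropy of a label array, in nats."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log(p))

y = np.array([1] * 6 + [0] * 4)      # 6 positives, 4 negatives
left, right = y[:5], y[5:]           # a hypothetical split into two halves
h_after = 0.5 * entropy(left) + 0.5 * entropy(right)
print(f"information gain: {entropy(y) - h_after:.4f}")  # ~0.4228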
Gini Index - CART
The decision tree computes the Gini index of each candidate feature and selects the one with the smallest Gini index, producing a binary tree. The Gini index is computed as:
$$\mathrm{Gini}(D) = \sum_{k=1}^{K} p_k (1 - p_k) = 1 - \sum_{k=1}^{K} p_k^2$$
After selecting a feature X that partitions D into D_1, ..., D_n, the Gini index under X is:
$$\mathrm{Gini}(D, X) = \sum_{i=1}^{n} \frac{|D_i|}{|D|} \mathrm{Gini}(D_i)$$
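The same toy split as above, scored with the Gini index instead (again self-contained, with invented numbers):
import numpy as np

def gini(labels):
    """Gini impurity of a label array."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

y = np.array([1] * 6 + [0] * 4)      # same toy labels as above
left, right = y[:5], y[5:]           # same hypothetical split
g_after = 0.5 * gini(left) + 0.5 * gini(right)
print(f"Gini before: {gini(y):.4f}, after split: {g_after:.4f}")  # 0.48 vs 0.16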
Information Gain Ratio - C4.5
The information gain ratio addresses the bias of information gain toward features with many distinct values. It is the ratio of the information gain to the entropy of the feature itself:
$$g_R(D, A) = \frac{g(D, A)}{H_A(D)}, \quad H_A(D) = -\sum_{i=1}^{n} \frac{|D_i|}{|D|} \log \frac{|D_i|}{|D|}$$
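And the gain ratio for the same toy split, dividing the information gain by the intrinsic value of the split proportions:
import numpy as np

def entropy(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log(p))

y = np.array([1] * 6 + [0] * 4)              # same toy labels as above
left, right = y[:5], y[5:]                   # same hypothetical split
gain = entropy(y) - 0.5 * entropy(left) - 0.5 * entropy(right)
props = np.array([0.5, 0.5])                 # the split proportions give H_A(D)
intrinsic = -np.sum(props * np.log(props))   # = log 2
print(f"gain ratio: {gain / intrinsic:.4f}")  # ~0.61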
Binning Algorithm
The code for classification-based binning follows. First, compute the entropy / Gini / information gain ratio:
def cal_entro_gini_rate(source_data, label, split_points, column, method="entro"):
    """
    source_data: the source data, DataFrame;
    label: the label column of the data, str;
    split_points: the split points, list;
    column: the column being split, str;
    method: the criterion used to find the best point, {'entro', 'gini', 'entro_rate'}, str;
    """
    import numpy as np
    data = source_data
    labels = list(data[label].value_counts().index)
    if data.shape[0] < 10000:
        from copy import deepcopy
        data = deepcopy(data)

    def slice_impurity(slice_data):
        """Weighted impurity contribution of one slice (negative by convention)."""
        # An empty slice contributes nothing
        if slice_data.shape[0] == 0:
            return 0
        p_x = slice_data.shape[0] / data.shape[0]
        p_y = 0
        for c_value in labels:
            pi = slice_data[slice_data[label] == c_value].shape[0] / slice_data.shape[0]
            if pi == 0 or pi == 1:
                continue
            if method in ('entro', 'entro_rate'):
                p_y += pi * np.log(pi)      # entropy term (negative)
            else:                           # 'gini'
                p_y -= pi * (1 - pi)        # Gini term, negated to share the sign convention
        return p_x * p_y

    entro = 0
    points = sorted(split_points)
    # 1. Impurity of the first (n - 1) bins; sorting keeps the intervals disjoint
    prev = None
    for point in points:
        if data[column].dtypes == 'object':
            # Categorical column: one slice per listed category value
            slice_data = data[data[column] == point]
        else:
            # Numeric column: the half-open interval (prev, point]
            mask = data[column] <= point
            if prev is not None:
                mask &= data[column] > prev
            slice_data = data[mask]
            prev = point
        entro += slice_impurity(slice_data)
    # 2. The remaining part of the data
    if len(points) == 0:
        entro += slice_impurity(data)
    elif data[column].dtypes == 'object':
        entro += slice_impurity(data[~data[column].isin(points)])
    else:
        entro += slice_impurity(data[data[column] > points[-1]])
    # Return the (positive) impurity
    return -entro
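A quick check of this function on invented toy data (a split at x = 3 separates the two classes perfectly, so the conditional entropy drops to zero):
import pandas as pd

toy = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [0, 0, 0, 1, 1, 1]})
print(cal_entro_gini_rate(toy, "y", [], "x"))    # ~0.693: entropy of a 50/50 label
print(cal_entro_gini_rate(toy, "y", [3], "x"))   # 0.0: each bin is pure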
Next, the code that searches for the split points:
def bin_split_entro(source_data, label, ID, method='entro', min_entro=1e-5, max_num=10, nu=100000):
    """
    source_data: the source data, DataFrame;
    label: the label column of the data, str;
    ID: the ID column(s) of the data, list or str;
    method: the criterion used to find the best point, {'entro', 'gini', 'entro_rate'}, str;
    min_entro: the minimum gain required to keep splitting a column, float;
    max_num: the maximum number of split points per column, int;
    nu: if the data has at least nu rows, it is not copied before splitting, int.
    """
    # Guard against invalid parameters
    assert method in ['entro', 'gini', 'entro_rate']
    # Primary parameters
    data = source_data
    split_dict = {}
    id_cols = [ID] if isinstance(ID, str) else list(ID)
    # Copy the data if it is small enough
    if data.shape[0] < nu:
        from copy import deepcopy
        data = deepcopy(source_data)
    # Loop over the candidate columns
    for column in data.columns:
        # Skip the ID column(s), object columns and the label itself
        if column in id_cols or data[column].dtypes == 'object' or column == label:
            continue
        # Data ready: sort by the column and drop its missing values
        column_data = data.sort_values(by=column, axis=0).reset_index(drop=True)
        column_data = column_data.dropna(axis=0, subset=[column])
        split_dict[column] = []
        # Greedily add the best split point until none is good enough
        while True:
            max_gain = 0
            point = None
            # Impurity with the current split points
            p_entro = cal_entro_gini_rate(column_data, label, split_dict[column], column, method)
            # Try every candidate value and keep the one with the best gain
            for i in column_data.index:
                if column_data.loc[i, column] in split_dict[column]:
                    continue
                # Tentatively add the candidate and re-measure the impurity
                split_dict[column].append(column_data.loc[i, column])
                s_entro = cal_entro_gini_rate(column_data, label, split_dict[column], column, method)
                if method == 'entro_rate':
                    # Normalize the gain by the intrinsic value of the split;
                    # guard against division by zero
                    x_entro = cal_entro_gini_rate(column_data, column, split_dict[column], column, method)
                    gain = (p_entro - s_entro) / x_entro if x_entro > 0 else 0
                else:  # 'entro' or 'gini'
                    gain = p_entro - s_entro
                if gain > max_gain:
                    point = column_data.loc[i, column]
                    max_gain = gain
                split_dict[column].pop()
            # Stop when the gain is too small, no point was found,
            # or the column already has max_num split points
            if max_gain < min_entro or point is None or len(split_dict[column]) >= max_num:
                break
            print(f"{column} point: {point} \nmax gain: {max_gain}")
            split_dict[column].append(point)
    # Return the split points for every column
    return split_dict
Finally, the call:
bin_split_entro(data, label="Label", ID="ID", method='entro')
AGE point: 49
max gain: 0.002119389403307026
MONTHLY_INCOME_WHITHOUT_TAX point: 9927.0
max gain: 0.004442633244405747
GAGE_TOTLE_PRICE point: 646408.0
max gain: 0.09319132161711585
APPLY_AMOUNT point: 780000.0
max gain: 0.004370596683120231
APPLY_TERM_TIME point: 60
max gain: 0.0053769809107147015
APPLY_INTEREST_RATE point: 5.346
max gain: 0.3256408841552193
{'AGE': [49],
'MONTHLY_INCOME_WHITHOUT_TAX': [9927.0],
'GAGE_TOTLE_PRICE': [646408.0],
'APPLY_AMOUNT': [780000.0],
'APPLY_TERM_TIME': [60],
'APPLY_INTEREST_RATE': [5.346]}
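Since the section is titled One Hot, the split points returned above would still need to be turned into one-hot features. A minimal sketch under that assumption, reusing one entry of the result with pd.cut and pd.get_dummies:
import numpy as np
import pandas as pd

split_dict = {'AGE': [49]}                # one entry from the result above
for col, points in split_dict.items():
    # Bin edges (-inf, 49], (49, inf), matching the <= convention used while splitting
    edges = [-np.inf] + sorted(points) + [np.inf]
    data[f"{col}_bin"] = pd.cut(data[col], bins=edges)
# One-hot encode the binned columns
data = pd.get_dummies(data, columns=[f"{c}_bin" for c in split_dict])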