ML/Kaggle笔记#

记录ml/kaggle各种实际操作

不想引入过多的库，太杂乱了。尽量只用pandas/numpy/sklearn。

其他各种库sns/mlxtend/fuzzywuzzy

https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html

https://scikit-learn.org/stable/user_guide.html

1.概念#

1.1 Dataframe/Series#

import pandas as pd  
import numpy as np    
  
df = pd.DataFrame({'Yes':[50, 21], 'No':[131, 2]})  

df = pd.Series([1, 2, 3, 4, 5])  

# 也可以有名字/index等各种信息  
df = pd.Series([30, 35, 40], index=['2015 Sales', '2016 Sales', '2017 Sales'], name='Product A')  

df['ggg']  
df.ggg  
df.iloc[2] # 取#2行  
df.iloc[:, 2] # 取#2列  
df.iloc[3:5, 2] # 取#3-5行 #2列  
df.iloc[[4, 5 ,9], 2] # 取#4#5#9行 #2列  
  
# iloc vs loc  
# https://stackoverflow.com/questions/31593201/how-are-iloc-and-loc-different  
  
# loc选取行列  
df.loc[[0,1,10,100], ['country', 'province', 'region_1', 'region_2']]  
  
# 条件选取  
print(df.loc[(df.MSZoning == 'RL') & (df.SaleCondition == 'Abnorml') | (df.SalePrice > 400000)])  
print(df.loc[(df.SaleType.isin(['COD']))])  
  
# 获取符合条件的index  
index = df.index[df.SaleType == 'New']  
print(df.index[(df.SaleType == 'New') & (df.SalePrice > 400000)])  
  
print(index)  
print(df.loc[index])  
  
# 直接改数据  
df.SalePrice = 666  
df.SalePrice = df.SalePrice * 10000000  
df.SalePrice.replace(208500, 666, inplace = True) # 替换  
  
# 查看某列的简单信息  
print(df.SalePrice.describe())  
print(df.SalePrice.mean())  
print(df.SalePrice.median())  
print(df['SaleType'].unique())  
print(df.SalePrice.value_counts()) # 每个唯一值的数量  

1.2 map/apply#

# 可处理数据。例如给数据转换格式。  
  
# 某一列的值除以10  
df.SalePrice = df.SalePrice.map(lambda p: p / 10)  
  
# 修改index之中的行数据  
index = df.index[df.SaleType == 'New']  
df.loc[index, 'SalePrice'] = df.loc[index, 'SalePrice'].map(lambda p: p * 10)  
  
# 用apply做一样的效果  
def deal_data(row):  
    row.SalePrice *= 100  
    return row  
  
df = df.apply(deal_data, axis = 1)  
  
# 排序。DataFrame.sort_values  
print(df.sort_values(by = 'SalePrice', ascending = False))  
  
# 多列依次排序  
print(df.sort_values(by = ['SalePrice', 'MSSubClass']))  

1.3 Grouping#

# Grouping  
  
# 按某列的值分组，再计算。  
# 默认会用指定的group作为新的index  
print(df.groupby('SaleType').SalePrice.sum())  
print(df.groupby('SaleType').SalePrice.min())  
print(df.groupby('SaleType').SalePrice.max())  
  
# 按1列分组  
# apply返回原始数据  
with pd.option_context('display.max_rows', None):  
    print(df.groupby(['SalePrice']).apply(lambda x: x))  
  
# 按2列分组  
print(df.groupby(['SaleType', 'SalePrice']).apply(lambda x: x))  
  
# 用函数看每条数据的内容  
# 数据是每个分组的所有数据，是个DataFrame，可包含多条数据。  
def app(data):  
    print(type(data))  
    print(data)  
    return data  
  
print(df.groupby(['SaleType', 'SalePrice']).apply(app))  
  
# 返回每组里某个列最大的的那个  
def app(data):  
    print(type(data))  
    print(data)  
    print(data.SalePrice)  
    idxmax = pd.to_numeric(data['MSSubClass']).idxmax()  
    print(f"idxmax {idxmax}")  
    return df.loc[idxmax]  
  
print(df.groupby(['SaleType', 'SalePrice']).apply(app))  
  
  
# agg对每组做多个计算并列出  
# 参数种函数的写法有待研究  
print(df.groupby(['SaleType', 'SalePrice']).MSSubClass.agg([len, 'min', 'max', 'mean']))  
  
# 对多列分组，并agg，并对结果再排序。  
df.groupby(['SaleType', 'SalePrice']).MSSubClass.agg([len, 'min', 'max', 'mean']).sort_values(by = ['len', 'mean'])  

1.4 数据简单预处理#

# 数据集  
# https://en.wikipedia.org/wiki/List_of_datasets_for_machine-learning_research  
# https://archive.ics.uci.edu  
# https://www.kaggle.com/datasets  
  
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'  
#url = 'https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction/download?datasetVersionNumber=1'  
  
# 读取csv  
# url。有的情况需要额外处理。  
df = pd.read_csv(url)  
print(df.head())  
  
# https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction?select=heart.csv  
# https://www.kaggle.com/competitions/home-data-for-ml-course/data?select=train.csv  
  
# 可直接读zip  
df = pd.read_csv('heart.zip')  
print(df.head())  
  
# csv  
df = pd.read_csv('train.csv')  
print(df.head())  
  
# 查看数据类型  
print(df.dtypes)  
  
# 缺失数据相关  
  
# https://pandas.pydata.org/docs/user_guide/missing_data.html  
# 查看null数据量  
# sum默认针对列  
missing = df.isnull().sum()  
print(missing)  
  
# 针对行  
print(df.isnull().sum(axis = 1))  
  
# 打印全部数据  
with pd.option_context('display.max_rows', None):  
    print(missing)  
  
# 打印部分  
print(missing[:10])  
  
# 再sum得到总数  
print(missing.sum())  
  
# 打印百分比  
print(f"missing {missing.sum()/np.product(df.shape):.2%}")  
  
# 获取某列有null的数据  
print(df[pd.isnull(df['XXX'])])  
  
# 丢弃包含na的行或列  
df.dropna(inplace = True)  
df.dropna(axis = 1, inplace = True)  
  
# 填充na  
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html  
# 有多种填充策略  
df.fillna(0, inplace = True) # bfill ffill  
df.Age.fillna(df.Age.mean(), inplace = True) # 平均
df.Age.fillna(df.Age.mode()[0], inplace = True) # 最常出现
  
# 填入平均值的讨论  
# https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns  
  
# 用sklearn.impute.SimpleImputer填充  
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html  
# na填充为已存在值的平均值。mean等只能处理数字类型  
# fit_transform直接处理DataFrame，返回ndarray。  
# 可转回DataFrame。columns = df.columns保留原始的header。  
from sklearn.impute import SimpleImputer  
imp_mean = SimpleImputer(missing_values = np.nan, strategy='most_frequent') # most_frequent mean  
df = pd.DataFrame(imp_mean.fit_transform(df), columns = df.columns)  
  
  
  
# Scaling and Normalization  
# https://www.kaggle.com/code/alexisbcook/scaling-and-normalization/tutorial  
  
  
# Scaling  
  
from sklearn.preprocessing import MinMaxScaler, minmax_scale  
  
# 默认0-1。minmax_scale可直接处理某些列  
df[['MSSubClass', 'ggg']] = minmax_scale(df[['MSSubClass', 'ggg']])  
  
  
# =是引用。copy默认是deep copy。  
df2 = df  
df2 = df.copy()  
  
# 可获取两个df的diff  
print(df.compare(df2))  
print(df.equals(df2))  
# 不同的库和精度问题  
# from mlxtend.preprocessing import minmax_scaling  
# 这个minmax_scaling和sklearn的minmax_scale返回的结果可能不同。  
pd.set_option("display.precision", 28)  
  
# 显示默认精度时可能df.compare打印出来看上去一样但是df.equals是False。  
# 提高显示精度后可以看到确实有区别  
# 不同的库计算结果有精度的区别  
  
# Normalization  
# https://stackoverflow.com/questions/26414913/normalize-columns-of-a-dataframe  
  
from sklearn.preprocessing import StandardScaler  

# 数据转换  
  
# 数据转成datetime。添加新列。  
df['date_parsed'] = pd.to_datetime(df['date'], format="%m/%d/%y")  
  
# 统计某一列的字符串长度。有些情况下可以用来区分数据。  
data_lengths = df['LandSlope'].str.len()  
print(data_lengths.value_counts())  
  
# 转成某种类型  
df.SalePrice.astype(int)  
  
# 修改index和列名DataFrame.rename  
df.rename(index = {0:'firstEntry', 1:'secondEntry'})  
df.rename(columns = {'points': 'score'})

1.5 模型评估#

https://scikit-learn.org/stable/modules/model_evaluation.html

1.6 交叉验证#

https://scikit-learn.org/stable/modules/cross_validation.html

1.7 Mutual Information#

https://www.kaggle.com/code/ryanholbrook/mutual-information
https://scikit-learn.org/stable/modules/feature_selection.html

2 kaggle#

2.1 预测房价#

https://www.kaggle.com/learn/intro-to-machine-learning

# 官方例子

# 读数据
melbourne_data = pd.read_csv

# 丢弃空行
melbourne_data = melbourne_data.dropna(axis = 0)

# 选target
y = melbourne_data.Price

# 选feat
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]

# 随机森林
from sklearn.tree import DecisionTreeRegressor
melbourne_model = DecisionTreeRegressor(random_state = 1)
melbourne_model.fit(X, y)

# mse
from sklearn.metrics import mean_absolute_error
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

# 读数据
X_train_org = pd.read_csv('train.csv')  
X_test_org = pd.read_csv('test.csv')

X_test_all = X_test_org  
X_train_all = X_train_org  

# 处理target
X_train_all.dropna(axis = 0, subset = ['SalePrice'], inplace = True)  
y_train_all = X_train_all['SalePrice']  
X_train_all.drop(['SalePrice'], axis = 1, inplace = True)

# 选一些有序列。质量/数量等。  
ord_1_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']

# 选one hot
onehot_cols = ['LandContour', 'Condition1', 'PavedDrive', 'Electrical', 'MasVnrType', 'Alley', 'MiscFeature', 'BldgType', 'GarageType', 'Heating', 'RoofMatl', 'LotConfig', 'Functional', 'SaleType', 'RoofStyle', 'LandSlope', 'HouseStyle', 'SaleCondition', 'Condition2', 'Foundation', 'LotShape', 'MSZoning']

# 除掉挑选的有序列
onehot_cols = list(set(onehot_cols) - set(ord_cols))

# 数值类型的
numerical_cols = [cname for cname in X_train_all.columns if X_train_all[cname].dtype in ['int64', 'float64']]

numerical_cols.remove('MoSold')

print(f"onehot_cols {onehot_cols}")
print(f"ord_cols {ord_cols}")
print(f"numerical_cols {numerical_cols}")

# 挑选的所有列。有序/onehot/数字
my_cols = onehot_cols + numerical_cols + ord_cols
print(f"my_cols.shape {len(my_cols)}")

# 分割数据
X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, train_size = 0.8, random_state = 2)

X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()
X_test = X_test_all[my_cols].copy()

print(f"X_train {X_train}")
print(f"X_valid {X_valid}")
print(f"X_test {X_test}")

# 处理数字的pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler())
])

# 处理onehot的pipeline
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

# 处理顺序标签的pipeline
ord_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ord', OrdinalEncoder(#categories = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],
                              handle_unknown = 'use_encoded_value',
                              unknown_value = -1)) # np.nan
])

# 组成ColumnTransformer。指定pipeline及其处理的列
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('onehot', onehot_transformer, onehot_cols),
        ('ord', ord_transformer, ord_cols)
    ])


# 起模型
model = RandomForestRegressor(n_estimators = 150, random_state = 662)

# 组成最终的pipeline
my_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
                                ('model', model)])

# 训练
my_pipeline.fit(X_train, y_train)

# 预测
pred = my_pipeline.predict(X_test)

# 输出
output = pd.DataFrame({'Id': X_test_org['Id'], 'SalePrice': pred})
output.to_csv('out.csv', index=False)

后续

添加Mutual Information处理。折腾了半天已知的feature、OrdinalEncoder等，并没有很大提升。

造feature，例如OverallQual * GrLivArea。其mi值达到0.757572。结果有巨大提升。排名直接从4万左右到4千多。

2.1 泰坦尼克#

https://www.kaggle.com/competitions/titanic/overview

# Age有100多个missing，填充最常见的值。比丢弃有较多提升。

# sibsp和parch做一个总数，好像没用。

# 看cabin数据可发现有数据的明显生存率高。处理一下好像没用。

# pclass保留有序。不要做成one hot。

# 用nn。分数能到0.7799。排名3470/15286。
# 随机森林0.72488