资源简介
机器学习算法 XGBoost、LightGBM、CatBoost 的代码框架,可满足基本的数据分析需求,涵盖回归、二分类和多分类任务。
代码片段和文件信息
import pandas as pd
import numpy as np
import scipy as sp
# Read a CSV file; ``f`` is the file path / file name.
def red_csv_file(f, logging=False):
    """Load a CSV file into a pandas DataFrame.

    Args:
        f: File path (or any object ``pd.read_csv`` accepts, such as an
            open file handle) identifying the CSV to read.
        logging: When true, print the first five rows, the column names,
            the ``describe()`` summary and the ``info()`` report of the
            loaded frame.

    Returns:
        The DataFrame returned by ``pd.read_csv(f)``.
    """
    print("=================读 取 文 件===================")
    data = pd.read_csv(f)
    if logging:
        print(data.head(5))
        print(data.columns.values)
        print(data.describe())
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would only add a stray "None" line.
        data.info()
    return data
# Generic LogisticRegression template
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# 1. Load the data
# NOTE: the two frames below are empty placeholders; replace them with the
# real training / test data (the training frame must contain a 'label'
# column plus the feature columns referenced further down).
df_train = pd.DataFrame()
df_test = pd.DataFrame()
y_train = df_train["label"].values

# 2. Preprocessing
ss = StandardScaler()

# 3. Feature processing / re-encoding
# 3.1 Categorical variables
# handle_unknown="ignore": a category that only occurs in the test set is
# encoded as an all-zero row instead of raising during transform().
enc = OneHotEncoder(handle_unknown="ignore")
feats = ["creativeID", "adID", "campaignID"]
for i, feat in enumerate(feats):
    # Fit on the training column only, then reuse the fitted encoder for the
    # test column so both matrices share the same one-hot columns.
    x_train = enc.fit_transform(df_train[feat].values.reshape(-1, 1))
    x_test = enc.transform(df_test[feat].values.reshape(-1, 1))
    if i == 0:
        X_train, X_test = x_train, x_test
    else:
        X_train, X_test = (
            sparse.hstack((X_train, x_train)),
            sparse.hstack((X_test, x_test)),
        )

# 3.2 Numeric variables
# StandardScaler expects 2-D input; selecting the columns with a list keeps
# the array 2-D, otherwise reshape(-1, len(feats)) would be required.
feats = ["price", "age"]
x_train = ss.fit_transform(df_train[feats].values)
# Scale the test set with the statistics learned from the training set.
x_test = ss.transform(df_test[feats].values)
X_train, X_test = (
    sparse.hstack((X_train, x_train)),
    sparse.hstack((X_test, x_test)),
)

# Model training
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the positive class.
proba_test = lr.predict_proba(X_test)[:, 1]
#LightGBM二分类
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
print("Loading Data ... ")
# Load the data
# NOTE(review): load_data() is not defined in the visible part of this file;
# it is expected to return (train_x, train_y, test_x) — confirm.
train_x, train_y, test_x = load_data()

# Split the training data with sklearn.model_selection.train_test_split.
# test_size=0.05 holds out 5% of the rows for validation; adjust as needed.
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y,  # keep the class ratio of y identical in both splits
)
X_train = X
y_train = y
X_test = val_X
y_test = val_y

# Build the LightGBM datasets; the validation set references the training
# set so that both share the same feature binning.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (not a set) keeps the metric order deterministic.
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # larger values mean stronger L2 regularization
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True,
}
# train
print(‘Start training...‘)
gbm = lgb.train(params
lgb_train
num_boost_round=10000
valid_sets=lgb_eval
early_stopping_round
相关资源
- 二级考试python试题12套(包括选择题和
- pywin32_python3.6_64位
- python+ selenium教程
- PycURL(Windows7/Win32)Python2.7安装包 P
- 英文原版-Scientific Computing with Python
- 7.图像风格迁移 基于深度学习 pyt
- 基于Python的学生管理系统
- A Byte of Python(简明Python教程)(第
- Python实例174946
- Python 人脸识别
- Python 人事管理系统
- 基于python-flask的个人博客系统
- 计算机视觉应用开发流程
- python 调用sftp断点续传文件
- python socket游戏
- 基于Python爬虫爬取天气预报信息
- python函数编程和讲解
- Python开发的个人博客
- 基于python的三层神经网络模型搭建
- python实现自动操作windows应用
- python人脸识别(opencv)
- python 绘图(方形、线条、圆形)
- python疫情卡UN管控
- python 连连看小游戏源码
- 基于PyQt5的视频播放器设计
- 一个简单的python爬虫
- csv文件行列转换python实现代码
- Python操作Mysql教程手册
- Python Machine Learning Case Studies
- python获取硬件信息
评论
共有 条评论