相关文章推荐
绅士的茴香  ·  安装cplex到python_mob64ca ...·  2 周前    · 
个性的饼干  ·  实用的 Python 之 ...·  1 周前    · 
买醉的小熊猫  ·  The double-edged ...·  7 月前    · 
近视的橙子  ·  13.7 ...·  1 年前    · 
礼貌的凉面  ·  战神不败_抖抖音·  1 年前    · 
# --- Imports ---
# BUG FIX: the original collapsed every statement onto a single line, which is
# a SyntaxError in plain Python; reformatted one statement per line and removed
# the stray space in "causalml. match".
import warnings

import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from xgboost import XGBRegressor

from causalml.inference.meta import LRSRegressor
from causalml.inference.meta import XGBTRegressor, MLPTRegressor
from causalml.inference.meta import (
    BaseXRegressor,
    BaseRRegressor,
    BaseSRegressor,
    BaseTRegressor,
)
from causalml.match import NearestNeighborMatch, MatchOptimizer, create_table_one
from causalml.propensity import ElasticNetPropensityModel
from causalml.dataset import *   # provides synthetic_data
from causalml.metrics import *   # provides plot_gain, auuc_score

# Generate synthetic data using mode 1:
# y: outcome, X: covariates, treatment: binary assignment flag,
# tau: true treatment effect, b: expected outcome, e: propensity score
y, X, treatment, tau, b, e = synthetic_data(mode=1, n=10000, p=8, sigma=1.0)

一、S-Learner、T-Learner、X-Learner、R-Learner 预估

# Ready-to-use S-Learner using LinearRegression
learner_s = LRSRegressor()
# T-Learner: base meta-learner class fed with an XGBoost base model
learner_t = BaseTRegressor(learner=XGBRegressor())
# X-Learner: base meta-learner class fed with an XGBoost base model
# (keyword argument added for consistency with the sibling learners)
learner_x = BaseXRegressor(learner=XGBRegressor())
# R-Learner: base meta-learner class fed with an XGBoost base model
learner_r = BaseRRegressor(learner=XGBRegressor())

# Estimate the ATE (shown for the T-Learner; the call is the same for all)
ate_t = learner_t.estimate_ate(X, treatment, y)

# Estimate the ITE/CATE (shown for the T-Learner; the call is the same for all)
# fit_predict runs fit -> predict in one call
cate_t = learner_t.fit_predict(X=X, treatment=treatment, y=y)

特征重要性和指标评估

# Feature-importance ranking.
# BUG FIX: the original referenced an undefined name `slearner`; the S-learner
# instance defined above is `learner_s`.
# NOTE(review): `feature_names` and `pd` (pandas) must be defined earlier in
# the full article -- confirm when assembling a runnable script. `tau=cate_t`
# comes from the T-learner; pass the matching learner's CATE if you switch.
learner_s.plot_importance(X=X,
                          tau=cate_t,
                          normalize=True,
                          method='auto',
                          features=feature_names)

# Gain curve and AUUC (area under the uplift curve): columns are the predicted
# effect "T", the treatment flag "w" and the outcome "y".
df_preds = pd.DataFrame([cate_t.ravel(),
                         treatment.ravel(),
                         y.ravel()],
                        index=["T", "w", "y"]).T
plot_gain(df_preds, normalize=True)
print(auuc_score(df_preds))

二、PSMatching(倾向性得分匹配)

PSM 倾向性得分匹配法的Python代码实操

# 1. Install (run in a shell / notebook cell):  pip install psmatching
# BUG FIX: the original line `!pip install psmatching` is IPython shell magic
# and a SyntaxError in a plain .py file, so it is kept as a comment here.
# 2. Imports
import psmatching.match as psm
import pytest
import pandas as pd
import numpy as np
from psmatching.utilities import *
import statsmodels.api as sm

# 3. Path and model specification
path = "./data/psm/psm_gxslsj_data.csv"
# The model is the treatment column regressed on the covariates, written as
# "treatment ~ covariate + covariate + ..."
model = "PUSH ~ AGE + SEX + VIP_LEVEL + LASTDAY_BUY_DIFF + PREFER_TYPE + LOGTIME_PREFER + USE_COUPON_BEFORE + ACTIVE_LEVEL"
# Number of matches per treated unit: with k = 3 each PUSH=1 user is matched
# to three similar PUSH=0 users. (psmatching expects k as a string.)
k = "3"
m = psm.PSMatch(path, model, k)
# 4. Propensity-score estimation.
# Fit a logistic regression of the treatment on the covariates -- a binomial
# GLM -- and use its fitted values as the propensity scores.
df = pd.read_csv(path).set_index("ID")  # the user ID becomes the row index
print("\n计算倾向性匹配得分 ...", end = " ")
logit_model = sm.formula.glm(formula=model, data=df, family=sm.families.Binomial())
# Fit the generalized linear model for the given family; call
# fitted.summary() to inspect the full regression report if needed.
fitted = logit_model.fit()
print("\n计算完成!")
# Attach the scores to the data as a new column
df["PROPENSITY"] = fitted.fittedvalues
# 5. Randomly reorder the treated group to reduce any influence of the
# original row order on the matching.
propensity = df.PROPENSITY
# Recode the treatment column as booleans.
# NOTE(review): assumes unique()[1] is the treated label -- this depends on
# the order values first appear in the file; confirm against the data.
groups = df.PUSH == df.PUSH.unique()[1]
n = len(groups)
n1 = groups.sum()   # number of True (treated) rows
n2 = n - n1         # number of False (control) rows
g1 = propensity[groups]
g2 = propensity[~groups]
# The smaller group is matched against the larger one; swap so that n1 <= n2.
if n1 > n2:
    n1, n2, g1, g2 = n2, n1, g2, g1
# Shuffled iteration order over the treated-unit IDs.
m_order = list(np.random.permutation(groups[groups].index))
# 6. Nearest-neighbour matching on the propensity score: for each treated
# unit, pick the k control units with the closest scores.
matches = {}
k = int(k)
caliper = None    # optional maximum score distance; None disables the check
replace = False   # match without replacement: used controls leave the pool
print("\n给每个干预组匹配 [" + str(k) + "] 个对照组 ... ", end = " ")
# NOTE: renamed the loop variable (was `m`), which shadowed the PSMatch
# object created earlier.
for treated_id in m_order:
    # Absolute score difference between this treated unit and every
    # remaining control unit.
    dist = abs(g1[treated_id] - g2)
    array = np.array(dist)
    # Without replacement the control pool shrinks; skip once k or fewer
    # candidates remain (same guard as the original).
    if k < len(array):
        # the k smallest distances
        k_smallest = np.partition(array, k)[:k].tolist()
        if caliper is not None:
            # keep only candidates whose distance is within the caliper
            keep_diffs = [i for i in k_smallest if i <= float(caliper)]
            keep_ids = np.array(dist[dist.isin(keep_diffs)].index)
        else:
            # no caliper: keep all k nearest candidates
            keep_ids = np.array(dist[dist.isin(k_smallest)].index)
        # Distance ties can yield more than k ids -> sample down; a caliper
        # can yield fewer -> pad with "NA" so every row has exactly k entries.
        if len(keep_ids) > k:
            matches[treated_id] = list(np.random.choice(keep_ids, k, replace=False))
        else:
            # BUG FIX: the original padded matches[m] before ever assigning
            # it (KeyError) and its `while len(...) <= k` padded to k+1.
            matches[treated_id] = keep_ids.tolist()
            while len(matches[treated_id]) < k:
                matches[treated_id].append("NA")
        if not replace:
            # BUG FIX: drop only real control ids -- dropping the "NA"
            # padding raised a KeyError in the original.
            g2 = g2.drop([i for i in matches[treated_id] if i != "NA"])
print("\n匹配完成!")
# 7. 将匹配完成的结果合并起来
matches = pd.DataFrame.from_dict(matches, orient="index")
matches = matches.reset_index()
column_names = {}
column_names["index"] = "干预组"
for i in range(k):
    column_names[i] = str("匹配对照组_" + str(i+1))
matches = matches.rename(columns = column_names)
matched_data = get_matched_data(matches, df)
# 8. Covariate balance check: chi-square test of each covariate against the
# treatment indicator; p >= 0.05 counts as balanced ("通过").
# BUG FIX: the original used df.columns[0:-2], which kept the treatment
# column PUSH itself (cross-tabulating PUSH against PUSH) and silently
# dropped the last real covariate; exclude PUSH and PROPENSITY explicitly.
# NOTE: replace "PUSH" with your own treatment column name.
# NOTE(review): this tests balance on the FULL df; arguably it should use
# matched_data -- confirm the intent.
variables = [c for c in df.columns if c not in ("PUSH", "PROPENSITY")]
results = {}
print("开始评估匹配 ...")
for var in variables:
    # Frequency-count cross-tabulation for the chi-square test
    crosstable = pd.crosstab(df['PUSH'], df[var])
    if len(df[var].unique().tolist()) <= 2:
        # chi-square statistic, dof and p-value for a 2x2 table
        p_val = calc_chi2_2x2(crosstable)[1]
    else:
        # chi-square statistic, dof and p-value for a 2xC table
        p_val = calc_chi2_2xC(crosstable)[1]
    results[var] = p_val
    print("\t" + var + '(' + str(p_val) + ')', end = "")
    if p_val < 0.05:
        print(": 未通过")
    else:
        print(": 通过")
if any(p < 0.05 for p in results.values()):
    print("\n变量未全部通过匹配")
else:
    print("\n变量全部通过匹配")

未完待续

  • 私信