import pandas as pd
from sklearn import model_selection
# Load the preprocessed dataset, then hold out 10% of the rows as a test set.
# The fixed random_state keeps the split identical across runs.
data = pd.read_csv(
    "input/pn_same_judge_preprocessed.csv",
)
train, test = model_selection.train_test_split(
    data,
    test_size=0.1,
    random_state=0,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.pipeline import Pipeline
# TF-IDF features fed into an SGD-trained linear classifier. With no `loss`
# argument SGDClassifier uses its default (hinge) loss, i.e. a linear SVM.
# tokenizer=str.split means each document is split on whitespace only.
pipe_svm = Pipeline([
    ("vect", TfidfVectorizer(tokenizer=str.split)),
    ("clf", SGDClassifier(random_state=0)),
])
# The pipeline must be fitted before decision_function can be called on it.
# NOTE(review): the training columns are assumed to mirror the "tokens" /
# "label_num" columns used on the test split -- confirm against the notebook.
pipe_svm.fit(train["tokens"], train["label_num"])
Pipeline(steps=[('vect',
TfidfVectorizer(tokenizer=<method 'split' of 'str' objects>)),
('clf', SGDClassifier(random_state=0))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# decision_function yields one real-valued confidence score per test document;
# the precision-recall curve only needs a ranking, so raw scores are enough.
score_svm = pipe_svm.decision_function(test["tokens"])
PrecisionRecallDisplay.from_predictions(
    y_true=test["label_num"],
    y_pred=score_svm,
    name="Online SVM",
)
# Same TF-IDF + linear SVM pipeline, now with the hyperparameters spelled out:
# hinge loss, L2 penalty, regularisation strength alpha=1e-3. With tol=None the
# tolerance-based stopping criterion is off, so max_iter=5 bounds the training.
pipe_svm = Pipeline([
    ("vect", TfidfVectorizer(tokenizer=str.split)),
    (
        "clf",
        SGDClassifier(
            loss="hinge",
            penalty="l2",
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])
# The pipeline must be fitted before decision_function can be called on it.
# NOTE(review): the training columns are assumed to mirror the "tokens" /
# "label_num" columns used on the test split -- confirm against the notebook.
pipe_svm.fit(train["tokens"], train["label_num"])
Pipeline(steps=[('vect',
TfidfVectorizer(tokenizer=<method 'split' of 'str' objects>)),
('clf',
SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
tol=None))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Re-score the test split with the re-configured SVM pipeline and plot its
# precision-recall curve from the raw decision scores.
score_svm = pipe_svm.decision_function(test["tokens"])
PrecisionRecallDisplay.from_predictions(
    y_true=test["label_num"],
    y_pred=score_svm,
    name="Online SVM",
)
SGDClassifierで loss を log_loss にすることで、以下の目的関数を最適化するロジスティック回帰モデルに対応します。
\[\begin{align}
L(y_i, f(x_i)) &= \log(1 + \exp(- y_i f(x_i))) \\
R(w) &= \|w\|_2^2 \tag{2}
\end{align}\]
学習してみましょう。
# Same TF-IDF front end, but loss="log_loss" turns the SGD-trained linear
# model into logistic regression, which also exposes predict_proba.
pipe_log = Pipeline([
    ("vect", TfidfVectorizer(tokenizer=str.split)),
    ("clf", SGDClassifier(loss="log_loss", random_state=0)),
])
# The pipeline must be fitted before predict_proba can be called on it.
# NOTE(review): the training columns are assumed to mirror the "tokens" /
# "label_num" columns used on the test split -- confirm against the notebook.
pipe_log.fit(train["tokens"], train["label_num"])
Pipeline(steps=[('vect',
TfidfVectorizer(tokenizer=<method 'split' of 'str' objects>)),
('clf', SGDClassifier(loss='log_loss', random_state=0))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# predict_proba returns one column per class; column 1 is the probability of
# the second entry in classes_ (presumably the positive label -- verify).
score_log = pipe_log.predict_proba(test["tokens"])[:, 1]
PrecisionRecallDisplay.from_predictions(
    y_true=test["label_num"],
    y_pred=score_log,
    name="Logistic regression",
)