import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Suppress every warning emitted after this point.
warnings.filterwarnings("ignore")
# Read the dataset
df_train = pd.read_csv('../data/train.csv')
df_valid = pd.read_csv('../data/test.csv')
df_test = pd.read_csv('../data/testB.csv')

# Replace missing cells with empty strings. fillna returns a new frame rather
# than modifying in place, so every result has to be bound back to its name
# (the df_valid result was previously thrown away).
df_train = df_train.fillna('')
df_valid = df_valid.fillna('')
df_test = df_test.fillna('')
# Features
def _join_row_text(frame, exclude=('uuid', 'label')):
    """Return one space-joined string per row of *frame*.

    Every column not listed in *exclude* is converted to ``str`` and the
    values of each row are joined with single spaces, in column order.

    ``uuid`` is an identifier, and ``label`` is the training target: leaving
    the target in the text would build the training text from a column that
    frames without a label do not have. Names in *exclude* that are absent
    from *frame* are simply ignored.
    """
    feature_cols = [col for col in frame.columns if col not in exclude]
    return frame[feature_cols].astype(str).apply(' '.join, axis=1)


df_train['text'] = _join_row_text(df_train)
df_valid['text'] = _join_row_text(df_valid)
df_test['text'] = _join_row_text(df_test)
# Learn the TF-IDF vocabulary and weights from the training text only.
vector = TfidfVectorizer()
vector.fit(df_train['text'].tolist())
vocab = vector.vocabulary_

# Project all three splits onto that same fitted vocabulary.
train_vector, valid_vector, test_vector = (
    vector.transform(frame['text'])
    for frame in (df_train, df_valid, df_test)
)
# vocab maps term -> column index. Iterating the dict directly yields terms in
# insertion order, which is not guaranteed to match the column index, so the
# headers could name the wrong feature. Sorting the terms by their index gives
# one header per column, in column order.
feature_names = sorted(vocab, key=vocab.get)

# Dense copies of the TF-IDF matrices with named columns.
df_trainv = pd.DataFrame(train_vector.toarray(), columns=feature_names)
df_validv = pd.DataFrame(valid_vector.toarray(), columns=feature_names)
df_testv = pd.DataFrame(test_vector.toarray(), columns=feature_names)
# Summary statistics of the training frame. The result is not assigned or
# printed, so it is only visible when this is evaluated interactively
# (e.g. as the last expression of a notebook cell).
df_train.describe()
# lightGBM model
import lightgbm as lgb

# Training set: the dense TF-IDF frame as features, the 'label' column as target.
# No validation set is passed to training; df_validv is not used here.
data = lgb.Dataset(data=df_trainv, label=df_train['label'])

# Binary classification with gradient-boosted trees, scored by log loss.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

model = lgb.train(params=params, train_set=data)
# Score the test split (the model uses the 'binary' objective), then turn the
# scores into hard 0/1 labels with a 0.5 threshold.
test_scores = model.predict(test_vector)
y_pred = (test_scores >= 0.5).astype(int)
df_test['label'] = y_pred
# The exported file carries a 'Keywords' column; it is populated from 'title'.
df_test['Keywords'] = df_test['title'].fillna('')

submission = df_test[['uuid', 'Keywords', 'label']]
# index=False is the documented way to leave the row index out of the CSV;
# index=None only worked because None happens to be falsy.
submission.to_csv('../data/task1_lightGBM.csv', index=False)