@zqbinggong · 2018-04-16

LightGBM sklearn API

The first script builds day-window conversion-rate and history features, then trains LightGBM through its sklearn-style interface.

```python
# coding:utf-8
import pandas as pd
import time
import numpy as np
import warnings

np.random.seed(2018)
warnings.filterwarnings("ignore")

# timestamp handling
def time2cov(time_):
    '''
    Dates were shifted by whole days, so the calendar date is anonymized
    but the time of day is not.
    :param time_: epoch timestamp in seconds
    :return: "%Y-%m-%d %H:%M:%S" string
    '''
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_))

print('train')
train = pd.read_csv('../data/round1_ijcai_18_train_20180301.txt', sep=" ")
train = train.drop_duplicates(['instance_id'])
train = train.reset_index(drop=True)
print('test')
test_a = pd.read_csv('../data/round1_ijcai_18_test_a_20180301.txt', sep=" ")

all_data = pd.concat([train, test_a])
all_data['real_time'] = pd.to_datetime(all_data['context_timestamp'].apply(time2cov))
all_data['real_hour'] = all_data['real_time'].dt.hour
all_data['real_day'] = all_data['real_time'].dt.day

def time_change(hour):
    # previous hour, wrapping 0 -> 23
    hour = hour - 1
    if hour == -1:
        hour = 23
    return hour

def time_change_1(hour):
    # next hour, wrapping 23 -> 0
    hour = hour + 1
    if hour == 24:
        hour = 0
    return hour

all_data['hour_before'] = all_data['real_hour'].apply(time_change)
all_data['hour_after'] = all_data['real_hour'].apply(time_change_1)

# days present: 18 21 19 20 22 23 24 | 25 (day 25 is the test set)
print(all_data['real_day'].unique())
```
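The two hour shifts are just modular arithmetic; a standalone sanity check with made-up timestamps (values here are arbitrary, output depends on the local timezone):

```python
import time
import pandas as pd

def time2cov(time_):
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time_))

ts = pd.Series([1537315200, 1537400000])  # two arbitrary epoch timestamps
hours = pd.to_datetime(ts.apply(time2cov)).dt.hour
print(hours.apply(lambda h: (h - 1) % 24).tolist())  # same result as time_change
print(hours.apply(lambda h: (h + 1) % 24).tolist())  # same result as time_change_1
```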
Overlap between train and test ids, and the hand-rolled log loss used for sanity checks:

```python
# train/test coverage ratios
# print(len((set(train['user_id'])) & (set(test_a['user_id']))) / len(set(test_a['user_id'])))
# print(len((set(train['shop_id'])) & (set(test_a['shop_id']))) / len(set(test_a['shop_id'])))
# print(len((set(train['item_id'])) & (set(test_a['item_id']))) / len(set(test_a['item_id'])))
# user 0.26714801444043323
# shop 0.9781637717121588
# item 0.956427604871448

# shop feat / item feat / user feat

def c_log_loss(y_t, y_p):
    # mean binary cross-entropy; the trailing False follows LightGBM's
    # custom-eval convention for "lower is better"
    tmp = np.array(y_t) * np.log(np.array(y_p)) + (1 - np.array(y_t)) * np.log(1 - np.array(y_p))
    return -np.sum(tmp) / len(y_t), False
```
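On probabilities away from 0 and 1 this matches sklearn's `log_loss`; a toy check:

```python
import numpy as np
from sklearn.metrics import log_loss

def c_log_loss(y_t, y_p):
    tmp = np.array(y_t) * np.log(np.array(y_p)) + (1 - np.array(y_t)) * np.log(1 - np.array(y_p))
    return -np.sum(tmp) / len(y_t), False

y_true = [0, 0, 1, 0]
y_prob = [0.1, 0.2, 0.7, 0.05]
print(c_log_loss(y_true, y_prob)[0])  # ~0.18412
print(log_loss(y_true, y_prob))       # same value
```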
Conversion-rate ("cov") features: for each key, the mean of `is_trade` and the sample count over the 1/2/3 days preceding the label window.

```python
# conversion-rate features over the x days before the current window
def get_before_cov_radio(all_data, label_data,
                         cov_list=list(['shop_id', 'item_id', 'real_hour', 'item_pv_level', 'item_sales_level']),
                         day_list=list([1, 2, 3])):
    result = []
    r = pd.DataFrame()
    label_data_time = label_data['real_day'].min()
    label_data_time_set = label_data['real_day'].unique()
    print('label set day', label_data_time_set)
    for cov in cov_list:
        for d in day_list:
            feat_set = all_data[
                (all_data['real_day'] >= label_data_time - d) & (all_data['real_day'] < label_data_time)
            ]
            print("cov feature", feat_set['real_day'].unique())
            print("cov time", cov)
            tmp = feat_set.groupby([cov], as_index=False).is_trade.agg(
                {'mean': np.mean, 'count': 'count'}).add_suffix("_%s_before_%d_day" % (cov, d))
            tmp.rename(columns={'%s_%s_before_%d_day' % (cov, cov, d): cov}, inplace=True)
            if d == 1:
                r = tmp
            else:
                r = pd.merge(r, tmp, on=[cov], how='outer').fillna(0)
        result.append(r)
    return result
```
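The dict form of `agg` used above (renaming via a dict) was deprecated in pandas 0.20 and removed in 1.0; with a recent pandas the same mean/count frame can be built with named aggregation. A standalone sketch:

```python
import pandas as pd

df = pd.DataFrame({
    'shop_id':  [1, 1, 2, 2, 2],
    'is_trade': [0, 1, 0, 0, 1],
})
tmp = (df.groupby('shop_id')['is_trade']
         .agg(mean='mean', count='count')   # named aggregation, pandas >= 0.25
         .add_suffix('_shop_id_before_1_day')
         .reset_index())
print(tmp)  # shop 1: mean 0.5 over 2 rows; shop 2: mean 1/3 over 3 rows
```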
```python
def calc_categry_feat(data):
    data['item_category_list_1'] = data['item_category_list'].apply(lambda x: int(x.split(';')[0]))
    data['item_category_list_2'] = data['item_category_list'].apply(lambda x: int(x.split(';')[1]))
    data['item_property_list_0'] = data['item_property_list'].apply(lambda x: int(x.split(';')[0]))
    data['item_property_list_1'] = data['item_property_list'].apply(lambda x: int(x.split(';')[1]))
    data['item_property_list_2'] = data['item_property_list'].apply(lambda x: int(x.split(';')[2]))
    for i in range(3):
        data['predict_category_%d' % (i)] = data['predict_category_property'].apply(
            lambda x: int(str(x.split(";")[i]).split(":")[0]) if len(x.split(";")) > i else -1
        )
    for item_cate in ['item_category_list_1', 'item_category_list_2']:
        for pre_item_cate in ['predict_category_0', 'predict_category_1', 'predict_category_2']:
            data['%s_%s' % (item_cate, pre_item_cate)] = data[item_cate] == data[pre_item_cate]
            data['%s_%s' % (item_cate, pre_item_cate)] = data['%s_%s' % (item_cate, pre_item_cate)].astype(int)
    del data['item_category_list']
    del data['item_property_list']
    del data['predict_category_property']
    return data
```
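What those match flags compute, on two made-up rows (the ids here are illustrative, far shorter than the real hashed ones):

```python
import pandas as pd

df = pd.DataFrame({
    'item_category_list': ['100;200', '100;300'],
    'predict_category_property': ['200:7,8;400:-1', '500:-1'],
})
df['item_category_list_2'] = df['item_category_list'].apply(lambda x: int(x.split(';')[1]))
df['predict_category_0'] = df['predict_category_property'].apply(
    lambda x: int(x.split(';')[0].split(':')[0]))
# 1 when the item's level-2 category equals the first predicted category
df['item_category_list_2_predict_category_0'] = \
    (df['item_category_list_2'] == df['predict_category_0']).astype(int)
print(df.iloc[:, -3:])  # row 0 matches (200 == 200), row 1 does not
```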
Columns carried into the label window, and user-history "lifetime" features:

```python
take_columns = ['instance_id', 'item_id', 'shop_id', 'user_id', 'is_trade']
shop_current_col = [
    'shop_score_description', 'shop_score_delivery', 'shop_score_service',
    'shop_star_level', 'shop_review_positive_rate', 'shop_review_num_level'
]
user_col = [
    'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level'
]
item_col = [
    'item_brand_id', 'item_city_id', 'item_price_level',
    'item_sales_level', 'item_collected_level', 'item_pv_level',
    'item_category_list', 'item_property_list'
]
time_feat = ['real_hour', 'hour_before', 'hour_after', 'context_timestamp', 'real_day']
context_col = ['predict_category_property', 'context_page_id']
feat = take_columns + shop_current_col + time_feat + user_col + item_col + context_col

def get_history_user_feat(all_data, data):
    label_data_time = data['real_day'].min()
    print(label_data_time)
    tmp = all_data[all_data['real_day'] < label_data_time]
    print(tmp['real_day'].unique())
    user_time = tmp.groupby(['user_id'], as_index=False).context_timestamp.agg(
        {'day_begin': 'min', 'day_end': 'max'})
    user_time['alive'] = user_time['day_end'] - user_time['day_begin']
    # note: label_data_time is a day index while day_begin is an epoch
    # timestamp, so s_alive mixes units; kept as in the original
    user_time['s_alive'] = label_data_time - user_time['day_begin']
    user_time['alive/s_alive'] = user_time['alive'] / user_time['s_alive']
    user_time_cov = tmp[tmp['is_trade'] == 1]
    user_time_cov = user_time_cov.groupby(['user_id'], as_index=False).context_timestamp.agg({'day_end_cov': 'max'})
    user_time_cov = pd.DataFrame(user_time_cov).drop_duplicates(['user_id', 'day_end_cov'])
    data = pd.merge(data, user_time[['user_id', 'alive', 's_alive', 'alive/s_alive', 'day_begin', 'day_end']],
                    on=['user_id'], how='left')
    data = pd.merge(data, user_time_cov, on=['user_id'], how='left')
    data['day_end_cov'] = data['day_end_cov'].fillna(data['day_end'])
    data['alive_cov'] = data['day_end_cov'] - data['day_begin']
    data['alive/alive_cov'] = data['alive'] / data['alive_cov']
    # data['s_alive/alive_cov'] = data['s_alive'] / data['alive_cov']
    del data['day_end_cov']
    del data['day_end']
    del data['day_begin']
    # commented-out experiment: per-user item_sales_level stats over the previous 1/2/3 days
    # for i in [1, 2, 3]:
    #     tmp = all_data[(all_data['real_day'] < data['real_day'].min()) & (all_data['real_day'] >= data['real_day'].min() - i)]
    #     user_item_sales_level_day = tmp.groupby(['user_id'], as_index=False)['item_sales_level'] \
    #         .agg({'user_item_sales_level_day_mean': 'mean',
    #               'user_item_sales_level_day_median': 'median',
    #               'user_item_sales_level_day_min': 'min',
    #               'user_item_sales_level_day_max': 'max',
    #               'user_item_sales_level_day_std': 'std',
    #               'user_item_sales_level_day_count': 'count'})
    #     data = pd.merge(data, user_item_sales_level_day, 'left', on=['user_id'])
    # data = data[['user_id', 'alive', 's_alive', 'alive/s_alive', 'alive_cov', 'alive/alive_cov']]
    return data.fillna(-1)
```
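The "alive" features are just min/max spans of each user's activity timestamps. A tiny standalone illustration:

```python
import pandas as pd

logs = pd.DataFrame({
    'user_id': [7, 7, 7, 9],
    'context_timestamp': [100, 160, 220, 500],
})
user_time = (logs.groupby('user_id', as_index=False)['context_timestamp']
                 .agg(day_begin='min', day_end='max'))
user_time['alive'] = user_time['day_end'] - user_time['day_begin']
print(user_time)  # user 7 was active over a 120 s span, user 9 over 0 s
```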
Per-hour shop-score dispersion and per-hour query counts over the previous 1/2/3 days, then the same counting pattern for item keys:

```python
def get_history_shop_feat(all_data, data):
    label_data_time = data['real_day'].min()
    print(label_data_time)
    for i in [1, 2, 3]:
        tmp = all_data[(all_data['real_day'] < label_data_time) & (all_data['real_day'] >= label_data_time - i)]
        shop_score_service_hour = tmp.groupby(['real_hour'], as_index=False)['shop_score_service'] \
            .agg({'shop_score_service_hour_std_%d' % (i): 'std'})
        data = pd.merge(data, shop_score_service_hour, 'left', on=['real_hour'])
        shop_score_delivery = tmp.groupby(['real_hour'], as_index=False)['shop_score_delivery'] \
            .agg({'shop_score_delivery_hour_std_%d' % (i): 'std'})
        data = pd.merge(data, shop_score_delivery, 'left', on=['real_hour'])
        shop_score_service_hour = tmp.groupby(['real_hour'], as_index=False)['shop_score_description'] \
            .agg({'shop_score_description_hour_std_%d' % (i): 'std'})
        data = pd.merge(data, shop_score_service_hour, 'left', on=['real_hour'])
        shop_review_positive_rate = tmp.groupby(['real_hour'], as_index=False)['shop_review_positive_rate'] \
            .agg({'shop_review_positive_rate_hour_std_%d' % (i): 'std'})
        data = pd.merge(data, shop_review_positive_rate, 'left', on=['real_hour'])
        shop_star_level = tmp.groupby(['real_hour'], as_index=False)['shop_star_level'] \
            .agg({'shop_star_level_hour_std_%d' % (i): 'std'})
        data = pd.merge(data, shop_star_level, 'left', on=['real_hour'])
        shop_review_num_level = tmp.groupby(['real_hour'], as_index=False)['shop_review_num_level'] \
            .agg({'shop_review_num_level_hour_std_%d' % (i): 'std'})
        data = pd.merge(data, shop_review_num_level, 'left', on=['real_hour'])
        shop_query_day_hour = tmp.groupby(['shop_id', 'real_hour']).size().reset_index().rename(
            columns={0: 'shop_query_day_hour_%d' % (i)})
        data = pd.merge(data, shop_query_day_hour, 'left', on=['shop_id', 'real_hour'])
    return data

def get_history_item_feat(all_data, data):
    for i in [1, 2, 3]:
        tmp = all_data[(all_data['real_day'] < data['real_day'].min()) & (all_data['real_day'] >= data['real_day'].min() - i)]
        # note: despite the name, this one counts by item_city_id, as in the original
        item_brand_id_day = tmp.groupby(['item_city_id', 'real_hour']).size().reset_index().rename(
            columns={0: 'item_brand_id_day_%d' % (i)})
        data = pd.merge(data, item_brand_id_day, 'left', on=['item_city_id', 'real_hour'])
        item_brand_id_hour = tmp.groupby(['item_brand_id', 'real_hour']).size().reset_index().rename(
            columns={0: 'item_brand_id_hour_%d' % (i)})
        data = pd.merge(data, item_brand_id_hour, 'left', on=['item_brand_id', 'real_hour'])
        item_pv_level_hour = tmp.groupby(['item_pv_level', 'real_hour']).size().reset_index().rename(
            columns={0: 'item_pv_level_hour_%d' % (i)})
        data = pd.merge(data, item_pv_level_hour, 'left', on=['item_pv_level', 'real_hour'])
        # commented-out experiment: per-day/hour item_pv_level stats
        # item_pv_level_day = data.groupby(['real_day', 'real_hour'], as_index=False)['item_pv_level'] \
        #     .agg({'item_pv_level_day_mean_%d' % (i): 'mean',
        #           'item_pv_level_day_median_%d' % (i): 'median',
        #           'item_pv_level_day_std_%d' % (i): 'std'})
        # data = pd.merge(data, item_pv_level_day, 'left', on=['real_day', 'real_hour'])
    return data
```
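`groupby(...).size()` plus a rename is the counting idiom used throughout. In miniature:

```python
import pandas as pd

tmp = pd.DataFrame({
    'item_brand_id': [1, 1, 2],
    'real_hour': [10, 10, 11],
})
cnt = (tmp.groupby(['item_brand_id', 'real_hour']).size()
          .reset_index().rename(columns={0: 'item_brand_id_hour_1'}))
print(cnt)  # brand 1 appeared twice in hour 10, brand 2 once in hour 11
```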
Day 24 is held out for validation, days 21-23 form the training windows, and day 25 (`test_a`) is scored for submission:

```python
print('make feat')

def make_feat(data, feat):
    '''
    :param data: label-window rows with their point-in-time features
    :param feat: list of aggregated history-feature frames
    :return: the merged feature frame
    '''
    data = calc_categry_feat(data)
    data = get_history_user_feat(all_data, data)
    data = get_history_shop_feat(all_data, data)
    data = get_history_item_feat(all_data, data)
    for f in feat:
        data = pd.merge(data, f, on=[f.columns[0]], how='left')
    return data.fillna(0)

test_a = all_data[train.shape[0]:]
train = all_data[:train.shape[0]]
val_a = train[train['real_day'] == 24]
train_a = train[train['real_day'] == 23]
train_b = train[train['real_day'] == 22]
train_c = train[train['real_day'] == 21]

# pass in the full data together with the current label window
test_cov_feat = get_before_cov_radio(all_data, test_a)
val_cov_feat = get_before_cov_radio(all_data, val_a)
train_cov_feat_a = get_before_cov_radio(all_data, train_a)
train_cov_feat_b = get_before_cov_radio(all_data, train_b)
train_cov_feat_c = get_before_cov_radio(all_data, train_c)

train_a = make_feat(train_a[feat], train_cov_feat_a)
train_b = make_feat(train_b[feat], train_cov_feat_b)
train_c = make_feat(train_c[feat], train_cov_feat_c)
test_a = make_feat(test_a[feat], test_cov_feat)
val_a = make_feat(val_a[feat], val_cov_feat)

train = pd.concat([train_a, train_b])
train = pd.concat([train, train_c])
# print(train.shape)
# train = pd.concat([train, val_a])
# print(train.shape)

y_train = train.pop('is_trade')
train_index = train.pop('instance_id')
X_train = train
y_test = test_a.pop('is_trade')
test_index = test_a.pop('instance_id')
X_test = test_a
y_val = val_a.pop('is_trade')
val_index = val_a.pop('instance_id')
X_val = val_a
# print(train.head())
```
  253. # print(train.head())
  254. category_list = [
  255. 'item_id','shop_id','user_id','user_gender_id','user_age_level',
  256. 'user_occupation_id','user_star_level',
  257. 'item_brand_id', 'item_city_id', 'item_price_level',
  258. 'item_sales_level', 'item_collected_level', 'item_pv_level',
  259. 'shop_review_num_level','shop_star_level','item_category_list_1','item_category_list_2',
  260. 'item_property_list_0','item_property_list_1','item_property_list_2',
  261. 'predict_category_0','predict_category_1','predict_category_2','context_page_id'
  262. ]
  263. def make_cat(data):
  264. for i in category_list:
  265. data[i] = data[i].astype('category')
  266. return data
  267. train_test_val = pd.concat([X_train,X_test])
  268. train_test_val = pd.concat([train_test_val,X_val])
  269. train_test_val = train_test_val.reset_index(drop=True)
  270. # train_test_val = make_cat(train_test_val)
  271. #
  272. # X_train = train_test_val[:X_train.shape[0]]
  273. # X_test = train_test_val[X_train.shape[0]:X_train.shape[0]+X_test.shape[0]]
  274. # X_val = train_test_val[X_train.shape[0]+X_test.shape[0]:]
  275. X_train = make_cat(X_train)
  276. X_test = make_cat(X_test)
  277. X_val = make_cat(X_val)
  278. print(X_train.shape)
  279. print(X_test.shape)
  280. print(X_val.shape)
  281. # X_test = make_cat(X_test)
  282. # X_val = make_cat(X_val)
  283. del X_train['hour_before']
  284. del X_test['hour_before']
  285. del X_val['hour_before']
  286. del X_train['hour_after']
  287. del X_test['hour_after']
  288. del X_val['hour_after']
  289. del X_train['real_day']
  290. del X_test['real_day']
  291. del X_val['real_day']
  292. print(X_train.dtypes)
  293. del X_train['context_timestamp']
  294. del X_test['context_timestamp']
  295. del X_val['context_timestamp']
  296. X_train = X_train[X_train.columns]
  297. X_test = X_test[X_train.columns]
  298. X_val = X_val[X_train.columns]
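Casting the id columns to pandas `category` is what lets LightGBM's sklearn API pick them up as categorical features (its default `categorical_feature='auto'` goes by dtype); internally it consumes the integer codes:

```python
import pandas as pd

df = pd.DataFrame({'shop_id': [1024, 2048, 1024]})
df['shop_id'] = df['shop_id'].astype('category')
print(df['shop_id'].cat.codes.tolist())  # [0, 1, 0] -- the codes LightGBM sees
```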
```python
import lightgbm as lgb

# offline run: early-stop against the day-24 validation split
gbm = lgb.LGBMRegressor(objective='binary',
                        num_leaves=32,
                        learning_rate=0.01,
                        n_estimators=2000,
                        colsample_bytree=0.65,
                        subsample=0.65,
                        seed=0)
gbm.fit(X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=['binary_logloss'],
        early_stopping_rounds=200)

imp = pd.DataFrame()
imp['n'] = list(X_train.columns)
imp['s'] = list(gbm.feature_importances_)
print(imp.sort_values('s', ascending=False))

print('Start predicting...')
# predict (the sklearn wrapper exposes the early-stopped round as best_iteration_)
y_pred_1 = gbm.predict(X_val, num_iteration=gbm.best_iteration_)
y_tt = gbm.predict(X_train, num_iteration=gbm.best_iteration_)
from sklearn.metrics import log_loss
print(log_loss(y_val, y_pred_1))
print(log_loss(y_train, y_tt))

# online submission: refit on train + val with the tuned tree count
gbm_sub = lgb.LGBMRegressor(objective='binary',
                            num_leaves=32,
                            learning_rate=0.01,
                            n_estimators=gbm.best_iteration_ + 1,
                            colsample_bytree=0.65,
                            subsample=0.65,
                            seed=0)
X_train = pd.concat([X_train, X_val])
y_train = pd.concat([y_train, y_val])
X_train = make_cat(X_train)
X_train = X_train[X_train.columns]
gbm_sub.fit(X_train, y_train,
            eval_set=[(X_train, y_train)],
            eval_metric=['binary_logloss'])
y_sub_1 = gbm_sub.predict(X_test)
y_tt = gbm_sub.predict(X_train, num_iteration=gbm_sub.best_iteration_)
print(log_loss(y_train, y_tt))

sub = pd.DataFrame()
sub['instance_id'] = list(test_index)
sub['predicted_score'] = list(y_sub_1)
sub.to_csv('../result/20180409.txt', sep=" ", index=False)
```
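That tune-then-refit pattern (early-stop on a validation day, then refit on everything with the chosen tree count) recurs in the second, self-contained baseline below. A minimal sketch on synthetic data, assuming a recent LightGBM (older releases passed `early_stopping_rounds` to `fit` instead of a callback):

```python
import numpy as np
import lightgbm as lgb

rng = np.random.RandomState(0)
X = rng.rand(1000, 5)
y = (X[:, 0] + 0.3 * rng.randn(1000) > 0.5).astype(int)
X_tr, X_va, y_tr, y_va = X[:800], X[800:], y[:800], y[800:]

gbm = lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.05)
gbm.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='binary_logloss',
        callbacks=[lgb.early_stopping(50)])

# refit on train + validation with the tuned tree count
final = lgb.LGBMClassifier(n_estimators=gbm.best_iteration_, learning_rate=0.05)
final.fit(X, y)
```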
The second script: label-encode the raw ids, expand the list-valued columns, and add a few hand-made 0/1 flags.

```python
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
import time

def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(value)
    dt = time.strftime(format, value)
    return dt

def base_process(data):
    lbl = preprocessing.LabelEncoder()
    print('---------- item ----------')
    data['len_item_category'] = data['item_category_list'].map(lambda x: len(str(x).split(';')))
    data['len_item_property'] = data['item_property_list'].map(lambda x: len(str(x).split(';')))
    for i in range(1, 3):
        # column 0 of item_category_list is identical everywhere, so start at 1
        data['item_category_list' + str(i)] = lbl.fit_transform(data['item_category_list'].map(
            lambda x: str(str(x).split(';')[i]) if len(str(x).split(';')) > i else ''))
    for i in range(10):
        data['item_property_list' + str(i)] = lbl.fit_transform(data['item_property_list'].map(
            lambda x: str(str(x).split(';')[i]) if len(str(x).split(';')) > i else ''))
    for col in ['item_id', 'item_brand_id', 'item_city_id']:
        data[col] = lbl.fit_transform(data[col])
    print('---------- user ----------')
    for col in ['user_id']:
        data[col] = lbl.fit_transform(data[col])
        # print('id:', data[col])
    print('user 0/1 features')
    data['gender0'] = data['user_gender_id'].apply(lambda x: 1 if x == -1 else 2)
    # the original tested `x == 1004 | x == 1005 | ...`; `|` binds before `==`
    # in Python, so membership tests give the intended grouping
    data['age0'] = data['user_age_level'].apply(lambda x: 1 if x in (1004, 1005, 1006, 1007) else 2)
    data['occupation0'] = data['user_occupation_id'].apply(lambda x: 1 if x in (-1, 2003) else 2)
    data['star0'] = data['user_star_level'].apply(lambda x: 1 if x in (-1, 3000, 3001) else 2)
    print('---------- context ----------')
    data['realtime'] = data['context_timestamp'].apply(timestamp_datetime)
    data['realtime'] = pd.to_datetime(data['realtime'])
    data['day'] = data['realtime'].dt.day
    data['hour'] = data['realtime'].dt.hour
    data['len_predict_category_property'] = data['predict_category_property'].map(lambda x: len(str(x).split(';')))
    for i in range(5):
        data['predict_category_property' + str(i)] = lbl.fit_transform(data['predict_category_property'].map(
            lambda x: str(str(x).split(';')[i]) if len(str(x).split(';')) > i else ''))
    print('context 0/1 features')
    data['context_page0'] = data['context_page_id'].apply(
        lambda x: 1 if x in (4001, 4002, 4003, 4004, 4007) else 2)
    print('---------- shop ----------')
    for col in ['shop_id']:
        data[col] = lbl.fit_transform(data[col])
    data['shop_score_delivery0'] = data['shop_score_delivery'].apply(lambda x: 0 if 0.96 <= x <= 0.98 else 1)
    print(data.shape)
    return data
```
Hand-tuned bucketizers for hour of day and the four shop scores (`jiange` is the bucket width):

```python
def map_hour(x):
    if (x >= 7) & (x <= 12):
        return 1
    elif (x >= 13) & (x <= 20):
        return 2
    else:
        return 3

def deliver(x):
    # x = round(x, 6)
    jiange = 0.1  # bucket width
    for i in range(1, 20):
        if (x >= 4.1 + jiange * (i - 1)) & (x <= 4.1 + jiange * i):
            return i + 1
    if x == -5:
        return 1

def deliver1(x):
    if (x >= 2) & (x <= 4):
        return 1
    elif (x >= 5) & (x <= 7):
        return 2
    else:
        return 3

def review(x):
    # x = round(x, 6)
    jiange = 0.02
    for i in range(1, 30):
        if (x >= 0.714 + jiange * (i - 1)) & (x <= 0.714 + jiange * i):
            return i + 1
    if x == -1:
        return 1

def review1(x):
    # x = round(x, 6)
    if (x >= 2) & (x <= 12):
        return 1
    elif (x >= 13) & (x <= 15):
        return 2
    else:
        return 3

def service(x):
    # x = round(x, 6)
    jiange = 0.1
    for i in range(1, 20):
        if (x >= 3.93 + jiange * (i - 1)) & (x <= 3.93 + jiange * i):
            return i + 1
    if x == -1:
        return 1

def service1(x):
    if (x >= 2) & (x <= 7):
        return 1
    elif (x >= 8) & (x <= 9):
        return 2
    else:
        return 3

def describe(x):
    # x = round(x, 6)
    jiange = 0.1
    for i in range(1, 30):
        if (x >= 3.93 + jiange * (i - 1)) & (x <= 3.93 + jiange * i):
            return i + 1
    if x == -1:
        return 1

def describe1(x):
    if (x >= 2) & (x <= 8):
        return 1
    elif (x >= 9) & (x <= 10):
        return 2
    else:
        return 3
```
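A quick check of the bucket arithmetic: 4.87 sits in the 8th 0.1-wide interval above 4.1, so `deliver` returns 9 and `deliver1` folds that into the top band:

```python
jiange = 0.1

def deliver(x):
    for i in range(1, 20):
        if 4.1 + jiange * (i - 1) <= x <= 4.1 + jiange * i:
            return i + 1
    if x == -5:
        return 1

print(deliver(4.87))  # 9: the fine-grained bucket
# deliver1 then maps 9 into band 3 (the "else" branch), i.e. a high score
```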
```python
def shijian(data):  # shijian: time-of-day features
    data['hour_map'] = data['hour'].apply(map_hour)
    return data

def shop_fenduan(data):  # fenduan: bucketize the shop scores
    data['shop_score_delivery'] = data['shop_score_delivery'] * 5
    data = data[data['shop_score_delivery'] != -5]
    data['deliver_map'] = data['shop_score_delivery'].apply(deliver)
    data['deliver_map'] = data['deliver_map'].apply(deliver1)
    # del data['shop_score_delivery']
    print(data.deliver_map.value_counts())

    data['shop_score_service'] = data['shop_score_service'] * 5
    data = data[data['shop_score_service'] != -5]
    data['service_map'] = data['shop_score_service'].apply(service)
    data['service_map'] = data['service_map'].apply(service1)
    # del data['shop_score_service']
    print(data.service_map.value_counts())  # read as good / neutral / poor

    data['shop_score_description'] = data['shop_score_description'] * 5
    data = data[data['shop_score_description'] != -5]
    data['de_map'] = data['shop_score_description'].apply(describe)
    data['de_map'] = data['de_map'].apply(describe1)
    # del data['shop_score_description']
    print(data.de_map.value_counts())

    data = data[data['shop_review_positive_rate'] != -1]
    data['review_map'] = data['shop_review_positive_rate'].apply(review)
    data['review_map'] = data['review_map'].apply(review1)
    print(data.review_map.value_counts())

    data['normal_shop'] = data.apply(
        lambda x: 1 if (x.deliver_map == 3) & (x.service_map == 3) & (x.de_map == 3) & (x.review_map == 3) else 0,
        axis=1)
    del data['de_map']
    del data['service_map']
    del data['deliver_map']
    del data['review_map']
    return data
```
```python
def slide_cnt(data):
    # item_cnt = data.groupby(by='item_id').count()['instance_id'].to_dict()
    # data['item_cnt'] = data['item_id'].apply(lambda x: item_cnt[x])
    # user_cnt = data.groupby(by='user_id').count()['instance_id'].to_dict()
    # data['user_cnt'] = data['user_id'].apply(lambda x: user_cnt[x])
    # shop_cnt = data.groupby(by='shop_id').count()['instance_id'].to_dict()
    # data['shop_cnt'] = data['shop_id'].apply(lambda x: shop_cnt[x])
    print('counts from the previous day')
    for d in range(19, 26):  # history days 18..24
        df1 = data[data['day'] == d - 1]
        df2 = data[data['day'] == d]  # label days 19..25
        user_cnt = df1.groupby(by='user_id').count()['instance_id'].to_dict()
        item_cnt = df1.groupby(by='item_id').count()['instance_id'].to_dict()
        shop_cnt = df1.groupby(by='shop_id').count()['instance_id'].to_dict()
        df2['user_cnt1'] = df2['user_id'].apply(lambda x: user_cnt.get(x, 0))
        df2['item_cnt1'] = df2['item_id'].apply(lambda x: item_cnt.get(x, 0))
        df2['shop_cnt1'] = df2['shop_id'].apply(lambda x: shop_cnt.get(x, 0))
        df2 = df2[['user_cnt1', 'item_cnt1', 'shop_cnt1', 'instance_id']]
        if d == 19:
            Df2 = df2
        else:
            Df2 = pd.concat([df2, Df2])
    data = pd.merge(data, Df2, on=['instance_id'], how='left')
    print('counts from all preceding days')
    for d in range(19, 26):
        # label days 19..25; day 25 is the test set
        df1 = data[data['day'] < d]
        df2 = data[data['day'] == d]
        user_cnt = df1.groupby(by='user_id').count()['instance_id'].to_dict()
        item_cnt = df1.groupby(by='item_id').count()['instance_id'].to_dict()
        shop_cnt = df1.groupby(by='shop_id').count()['instance_id'].to_dict()
        df2['user_cntx'] = df2['user_id'].apply(lambda x: user_cnt.get(x, 0))
        df2['item_cntx'] = df2['item_id'].apply(lambda x: item_cnt.get(x, 0))
        df2['shop_cntx'] = df2['shop_id'].apply(lambda x: shop_cnt.get(x, 0))
        df2 = df2[['user_cntx', 'item_cntx', 'shop_cntx', 'instance_id']]
        if d == 19:
            Df2 = df2
        else:
            Df2 = pd.concat([df2, Df2])
    data = pd.merge(data, Df2, on=['instance_id'], how='left')
    print('previous-hour statistics')  # left as a stub in the original
    return data
```
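The pattern above: build a dict of yesterday's counts, then map it over today's rows with a default of 0 for unseen keys. A standalone miniature (the `.copy()` avoids the SettingWithCopy warning the original silences globally):

```python
import pandas as pd

data = pd.DataFrame({
    'day': [18, 18, 19, 19],
    'user_id': [1, 1, 1, 2],
    'instance_id': [10, 11, 12, 13],
})
df1 = data[data['day'] == 18]
df2 = data[data['day'] == 19].copy()
user_cnt = df1.groupby(by='user_id').count()['instance_id'].to_dict()
df2['user_cnt1'] = df2['user_id'].apply(lambda x: user_cnt.get(x, 0))
print(df2[['user_id', 'user_cnt1']])  # user 1 -> 2, unseen user 2 -> 0
```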
```python
def zuhe(data):  # zuhe: pairwise feature crosses
    for col in ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']:
        data[col] = data[col].apply(lambda x: 0 if x == -1 else x)
    for col in ['item_sales_level', 'item_price_level', 'item_collected_level',
                'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level',
                'shop_review_num_level', 'shop_star_level']:
        data[col] = data[col].astype(str)
    print('pairwise item crosses')
    data['sale_price'] = data['item_sales_level'] + data['item_price_level']
    data['sale_collect'] = data['item_sales_level'] + data['item_collected_level']
    data['price_collect'] = data['item_price_level'] + data['item_collected_level']
    print('pairwise user crosses')
    data['gender_age'] = data['user_gender_id'] + data['user_age_level']
    data['gender_occ'] = data['user_gender_id'] + data['user_occupation_id']
    data['gender_star'] = data['user_gender_id'] + data['user_star_level']
    print('pairwise shop crosses')
    data['review_star'] = data['shop_review_num_level'] + data['shop_star_level']
    for col in ['item_sales_level', 'item_price_level', 'item_collected_level', 'sale_price', 'sale_collect', 'price_collect',
                'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level', 'gender_age', 'gender_occ', 'gender_star',
                'shop_review_num_level', 'shop_star_level', 'review_star']:
        data[col] = data[col].astype(int)
    del data['review_star']
    return data
```
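The crosses are string concatenations cast back to int, so each pair of level values gets its own id:

```python
import pandas as pd

df = pd.DataFrame({'user_gender_id': [0, 1], 'user_age_level': [1003, 1005]})
df['gender_age'] = (df['user_gender_id'].astype(str) + df['user_age_level'].astype(str)).astype(int)
print(df['gender_age'].tolist())
# [1003, 11005]; note the leading '0' collapses (int('01003') == 1003),
# so gender-0 crosses coincide with the bare age value
```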
```python
def item(data):
    print('per item: how many brand / city / price / sales / collected / pv levels ...')
    itemcnt = data.groupby(['item_id'], as_index=False)['instance_id'].agg({'item_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_id'], how='left')
    for col in ['item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_id'], as_index=False)['instance_id'].agg({str(col) + '_item_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_id'], how='left')
        data[str(col) + '_item_prob'] = data[str(col) + '_item_cnt'] / data['item_cnt']
    del data['item_cnt']

    print('per brand: how many price / sales / collected / pv levels ...')
    itemcnt = data.groupby(['item_brand_id'], as_index=False)['instance_id'].agg({'item_brand_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_brand_id'], how='left')
    for col in ['item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_brand_id'], as_index=False)['instance_id'].agg({str(col) + '_brand_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_brand_id'], how='left')
        data[str(col) + '_brand_prob'] = data[str(col) + '_brand_cnt'] / data['item_brand_cnt']
    del data['item_brand_cnt']

    print('per city: item_price_level, item_sales_level, item_collected_level, item_pv_level')
    itemcnt = data.groupby(['item_city_id'], as_index=False)['instance_id'].agg({'item_city_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_city_id'], how='left')
    for col in ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_city_id'], as_index=False)['instance_id'].agg({str(col) + '_city_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_city_id'], how='left')
        data[str(col) + '_city_prob'] = data[str(col) + '_city_cnt'] / data['item_city_cnt']
    del data['item_city_cnt']

    print('per price level: item_sales_level, item_collected_level, item_pv_level')
    itemcnt = data.groupby(['item_price_level'], as_index=False)['instance_id'].agg({'item_price_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_price_level'], how='left')
    for col in ['item_sales_level', 'item_collected_level', 'item_pv_level']:
        # fixed: the original grouped on 'item_city_id' here, which contradicts
        # both the _price_cnt naming and the item_price_cnt denominator
        itemcnt = data.groupby([col, 'item_price_level'], as_index=False)['instance_id'].agg({str(col) + '_price_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_price_level'], how='left')
        data[str(col) + '_price_prob'] = data[str(col) + '_price_cnt'] / data['item_price_cnt']
    del data['item_price_cnt']

    print('per sales level: item_collected_level, item_pv_level')
    itemcnt = data.groupby(['item_sales_level'], as_index=False)['instance_id'].agg({'item_salse_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_sales_level'], how='left')
    for col in ['item_collected_level', 'item_pv_level']:
        itemcnt = data.groupby([col, 'item_sales_level'], as_index=False)['instance_id'].agg({str(col) + '_salse_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_sales_level'], how='left')
        data[str(col) + '_salse_prob'] = data[str(col) + '_salse_cnt'] / data['item_salse_cnt']
    del data['item_salse_cnt']

    print('per collected level: item_pv_level')
    itemcnt = data.groupby(['item_collected_level'], as_index=False)['instance_id'].agg({'item_coll_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['item_collected_level'], how='left')
    for col in ['item_pv_level']:
        itemcnt = data.groupby([col, 'item_collected_level'], as_index=False)['instance_id'].agg({str(col) + '_coll_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'item_collected_level'], how='left')
        data[str(col) + '_coll_prob'] = data[str(col) + '_coll_cnt'] / data['item_coll_cnt']
    del data['item_coll_cnt']
    return data
```
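Each of these blocks computes a conditional frequency, count(key, col) / count(key). In miniature (named aggregation used for brevity; the dict form above needs an older pandas):

```python
import pandas as pd

df = pd.DataFrame({
    'item_id': [1, 1, 1, 2],
    'item_brand_id': [5, 5, 6, 6],
    'instance_id': [0, 1, 2, 3],
})
base = df.groupby('item_id', as_index=False)['instance_id'].agg(item_cnt='count')
pair = df.groupby(['item_brand_id', 'item_id'], as_index=False)['instance_id'] \
         .agg(item_brand_id_item_cnt='count')
df = df.merge(base, on='item_id').merge(pair, on=['item_brand_id', 'item_id'])
df['item_brand_id_item_prob'] = df['item_brand_id_item_cnt'] / df['item_cnt']
print(df[['item_id', 'item_brand_id', 'item_brand_id_item_prob']])  # 2/3, 2/3, 1/3, 1.0
```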
```python
def user(data):
    print('per user: how many gender / age / occupation / star values')
    itemcnt = data.groupby(['user_id'], as_index=False)['instance_id'].agg({'user_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_id'], how='left')
    for col in ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']:
        itemcnt = data.groupby([col, 'user_id'], as_index=False)['instance_id'].agg({str(col) + '_user_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'user_id'], how='left')
        data[str(col) + '_user_prob'] = data[str(col) + '_user_cnt'] / data['user_cnt']
    del data['user_cnt']

    print('per gender: how many age levels and occupations')
    itemcnt = data.groupby(['user_gender_id'], as_index=False)['instance_id'].agg({'user_gender_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_gender_id'], how='left')
    for col in ['user_age_level', 'user_occupation_id', 'user_star_level']:
        itemcnt = data.groupby([col, 'user_gender_id'], as_index=False)['instance_id'].agg({str(col) + '_user_gender_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'user_gender_id'], how='left')
        data[str(col) + '_user_gender_prob'] = data[str(col) + '_user_gender_cnt'] / data['user_gender_cnt']
    del data['user_gender_cnt']

    print('per user_age_level: user_occupation_id, user_star_level')
    itemcnt = data.groupby(['user_age_level'], as_index=False)['instance_id'].agg({'user_age_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_age_level'], how='left')
    for col in ['user_occupation_id', 'user_star_level']:
        itemcnt = data.groupby([col, 'user_age_level'], as_index=False)['instance_id'].agg({str(col) + '_user_age_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'user_age_level'], how='left')
        data[str(col) + '_user_age_prob'] = data[str(col) + '_user_age_cnt'] / data['user_age_cnt']
    del data['user_age_cnt']

    print('per user_occupation_id: user_star_level')
    itemcnt = data.groupby(['user_occupation_id'], as_index=False)['instance_id'].agg({'user_occ_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_occupation_id'], how='left')
    for col in ['user_star_level']:
        itemcnt = data.groupby([col, 'user_occupation_id'], as_index=False)['instance_id'].agg({str(col) + '_user_occ_cnt': 'count'})
        data = pd.merge(data, itemcnt, on=[col, 'user_occupation_id'], how='left')
        data[str(col) + '_user_occ_prob'] = data[str(col) + '_user_occ_cnt'] / data['user_occ_cnt']
    del data['user_occ_cnt']
    return data
```
```python
def user_item(data):
    itemcnt = data.groupby(['user_id'], as_index=False)['instance_id'].agg({'user_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_id'], how='left')
    print('per user: how many item_id, item_brand_id, ...')
    for col in ['item_id',
                'item_brand_id', 'item_city_id', 'item_price_level',
                'item_sales_level', 'item_collected_level', 'item_pv_level']:
        item_shop_cnt = data.groupby([col, 'user_id'], as_index=False)['instance_id'].agg({str(col) + '_user_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_id'], how='left')
        data[str(col) + '_user_prob'] = data[str(col) + '_user_cnt'] / data['user_cnt']
    print('per user_gender_id: how many item_id, item_brand_id, ...')
    itemcnt = data.groupby(['user_gender_id'], as_index=False)['instance_id'].agg({'user_gender_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_gender_id'], how='left')
    for col in ['item_id',
                'item_brand_id', 'item_city_id', 'item_price_level',
                'item_sales_level', 'item_collected_level', 'item_pv_level']:
        item_shop_cnt = data.groupby([col, 'user_gender_id'], as_index=False)['instance_id'].agg({str(col) + '_user_gender_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_gender_id'], how='left')
        data[str(col) + '_user_gender_prob'] = data[str(col) + '_user_gender_cnt'] / data['user_gender_cnt']
    print('per user_age_level: how many item_id, item_brand_id, ...')
    itemcnt = data.groupby(['user_age_level'], as_index=False)['instance_id'].agg({'user_age_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_age_level'], how='left')
    for col in ['item_id',
                'item_brand_id', 'item_city_id', 'item_price_level',
                'item_sales_level', 'item_collected_level', 'item_pv_level']:
        item_shop_cnt = data.groupby([col, 'user_age_level'], as_index=False)['instance_id'].agg({str(col) + '_user_age_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_age_level'], how='left')
        data[str(col) + '_user_age_prob'] = data[str(col) + '_user_age_cnt'] / data['user_age_cnt']
    print('per user_occupation_id: how many item_id, item_brand_id, ...')
    itemcnt = data.groupby(['user_occupation_id'], as_index=False)['instance_id'].agg({'user_occ_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['user_occupation_id'], how='left')
    for col in ['item_id',
                'item_brand_id', 'item_city_id', 'item_price_level',
                'item_sales_level', 'item_collected_level', 'item_pv_level']:
        item_shop_cnt = data.groupby([col, 'user_occupation_id'], as_index=False)['instance_id'].agg({str(col) + '_user_occ_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_occupation_id'], how='left')
        data[str(col) + '_user_occ_prob'] = data[str(col) + '_user_occ_cnt'] / data['user_occ_cnt']
    # the denominators (user_cnt, user_gender_cnt, ...) are deleted later in user_shop()
    return data
```
```python
def user_shop(data):
    print('per user: how many shop_id, shop_review_num_level, ...')
    for col in ['shop_id', 'shop_review_num_level', 'shop_star_level']:
        item_shop_cnt = data.groupby([col, 'user_id'], as_index=False)['instance_id'].agg(
            {str(col) + '_user_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_id'], how='left')
        data[str(col) + '_user_prob'] = data[str(col) + '_user_cnt'] / data['user_cnt']
    del data['user_cnt']
    print('per user_gender_id: how many shop_id, shop_review_num_level, ...')
    for col in ['shop_id', 'shop_review_num_level', 'shop_star_level']:
        item_shop_cnt = data.groupby([col, 'user_gender_id'], as_index=False)['instance_id'].agg(
            {str(col) + '_user_gender_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_gender_id'], how='left')
        data[str(col) + '_user_gender_prob'] = data[str(col) + '_user_gender_cnt'] / data['user_gender_cnt']
    del data['user_gender_cnt']
    print('per user_age_level: how many shop_id, shop_review_num_level, ...')
    for col in ['shop_id', 'shop_review_num_level', 'shop_star_level']:
        item_shop_cnt = data.groupby([col, 'user_age_level'], as_index=False)['instance_id'].agg(
            {str(col) + '_user_age_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_age_level'], how='left')
        data[str(col) + '_user_age_prob'] = data[str(col) + '_user_age_cnt'] / data['user_age_cnt']
    del data['user_age_cnt']
    print('per user_occupation_id: how many shop_id, shop_review_num_level, ...')
    for col in ['shop_id', 'shop_review_num_level', 'shop_star_level']:
        item_shop_cnt = data.groupby([col, 'user_occupation_id'], as_index=False)['instance_id'].agg(
            {str(col) + '_user_occ_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'user_occupation_id'], how='left')
        data[str(col) + '_user_occ_prob'] = data[str(col) + '_user_occ_cnt'] / data['user_occ_cnt']
    del data['user_occ_cnt']
    return data
```
```python
def shop_item(data):
    print('per shop: how many item_id, item_brand_id, item_city_id, item_price_level, ...')
    itemcnt = data.groupby(['shop_id'], as_index=False)['instance_id'].agg({'shop_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['shop_id'], how='left')
    for col in ['item_id',
                'item_brand_id', 'item_city_id', 'item_price_level',
                'item_sales_level', 'item_collected_level', 'item_pv_level']:
        item_shop_cnt = data.groupby([col, 'shop_id'], as_index=False)['instance_id'].agg({str(col) + '_shop_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'shop_id'], how='left')
        data[str(col) + '_shop_prob'] = data[str(col) + '_shop_cnt'] / data['shop_cnt']
    del data['shop_cnt']
    print('per shop_review_num_level: how many item_id, item_brand_id, item_city_id, item_price_level, ...')
    itemcnt = data.groupby(['shop_review_num_level'], as_index=False)['instance_id'].agg({'shop_rev_cnt': 'count'})
    data = pd.merge(data, itemcnt, on=['shop_review_num_level'], how='left')
    for col in ['item_id',
                'item_brand_id', 'item_city_id', 'item_price_level',
                'item_sales_level', 'item_collected_level', 'item_pv_level']:
        item_shop_cnt = data.groupby([col, 'shop_review_num_level'], as_index=False)['instance_id'].agg({str(col) + '_shop_rev_cnt': 'count'})
        data = pd.merge(data, item_shop_cnt, on=[col, 'shop_review_num_level'], how='left')
        data[str(col) + '_shop_rev_prob'] = data[str(col) + '_shop_rev_cnt'] / data['shop_rev_cnt']
    del data['shop_rev_cnt']
    # (disabled) the same pattern keyed on shop_star_level
    # itemcnt = data.groupby(['shop_star_level'], as_index=False)['instance_id'].agg({'shop_star_cnt': 'count'})
    # data = pd.merge(data, itemcnt, on=['shop_star_level'], how='left')
    # for col in ['item_id',
    #             'item_brand_id', 'item_city_id', 'item_price_level',
    #             'item_sales_level', 'item_collected_level', 'item_pv_level']:
    #     item_shop_cnt = data.groupby([col, 'shop_star_level'], as_index=False)['instance_id'].agg({str(col) + '_shop_star_cnt': 'count'})
    #     data = pd.merge(data, item_shop_cnt, on=[col, 'shop_star_level'], how='left')
    #     data[str(col) + '_shop_star_prob'] = data[str(col) + '_shop_star_cnt'] / data['shop_star_cnt']
    # del data['shop_star_cnt']
    return data
```
```python
def lgbCV(train, test):
    col = [c for c in train if
           c not in ['is_trade', 'item_category_list', 'item_property_list', 'predict_category_property',
                     'instance_id', 'context_id', 'realtime', 'context_timestamp']]
    # cat = ['sale_price', 'gender_star', 'user_age_level', 'item_price_level', 'item_sales_level', 'sale_collect',
    #        'price_collect', 'item_brand_id', 'user_star_level', 'item_id', 'shop_id',
    #        'item_city_id', 'context_page_id', 'gender_age', 'shop_star_level', 'item_pv_level', 'user_occupation_id',
    #        'day', 'gender_occ', 'user_gender_id']
    X = train[col]
    y = train['is_trade'].values
    X_tes = test[col]
    y_tes = test['is_trade'].values
    print('Training LGBM model...')
    lgb0 = lgb.LGBMClassifier(
        objective='binary',
        # metric='binary_error',
        num_leaves=35,
        max_depth=8,
        learning_rate=0.05,
        seed=2018,
        colsample_bytree=0.8,
        # min_child_samples=8,
        subsample=0.9,
        n_estimators=20000)
    lgb_model = lgb0.fit(X, y, eval_set=[(X_tes, y_tes)], early_stopping_rounds=200)
    best_iter = lgb_model.best_iteration_
    predictors = [i for i in X.columns]
    feat_imp = pd.Series(lgb_model.feature_importances_, predictors).sort_values(ascending=False)
    print(feat_imp)
    print(feat_imp.shape)
    # pred = lgb_model.predict(test[col])
    pred = lgb_model.predict_proba(test[col])[:, 1]
    test['pred'] = pred
    test['index'] = range(len(test))
    # print(test[['is_trade', 'pred']])
    print('log loss ', log_loss(test['is_trade'], test['pred']))
    return best_iter
```
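`predict_proba(...)[:, 1]` is what feeds `log_loss`; plain `predict` would give hard 0/1 labels. A synthetic check:

```python
import numpy as np
import lightgbm as lgb

rng = np.random.RandomState(2018)
X = rng.rand(200, 4)
y = (X[:, 0] > 0.5).astype(int)
clf = lgb.LGBMClassifier(n_estimators=50).fit(X, y)
print(clf.predict(X[:3]))              # hard 0/1 labels
print(clf.predict_proba(X[:3])[:, 1])  # P(is_trade = 1), what log_loss expects
```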
```python
def sub(train, test, best_iter):
    col = [c for c in train if
           c not in ['is_trade', 'item_category_list', 'item_property_list', 'predict_category_property',
                     'instance_id', 'context_id', 'realtime', 'context_timestamp']]
    X = train[col]
    y = train['is_trade'].values
    print('Training LGBM model...')
    lgb0 = lgb.LGBMClassifier(
        objective='binary',
        # metric='binary_error',
        num_leaves=35,
        max_depth=8,
        learning_rate=0.05,
        seed=2018,
        colsample_bytree=0.8,
        # min_child_samples=8,
        subsample=0.9,
        n_estimators=best_iter)
    lgb_model = lgb0.fit(X, y)
    predictors = [i for i in X.columns]
    feat_imp = pd.Series(lgb_model.feature_importances_, predictors).sort_values(ascending=False)
    print(feat_imp)
    print(feat_imp.shape)
    # pred = lgb_model.predict(test[col])
    pred = lgb_model.predict_proba(test[col])[:, 1]
    test['predicted_score'] = pred
    sub1 = test[['instance_id', 'predicted_score']]
    sub = pd.read_csv("input/test.txt", sep="\s+")
    sub = pd.merge(sub, sub1, on=['instance_id'], how='left')
    sub = sub.fillna(0)
    # sub[['instance_id', 'predicted_score']].to_csv('result/result0320.csv', index=None, sep=' ')
    sub[['instance_id', 'predicted_score']].to_csv('result/result0326.txt', sep=" ", index=False)
```
```python
if __name__ == "__main__":
    train = pd.read_csv("input/train.txt", sep="\s+")
    test = pd.read_csv("input/test.txt", sep="\s+")
    data = pd.concat([train, test])
    data = data.drop_duplicates(subset='instance_id')  # deduplicate on instance_id
    print('make feature')
    data = base_process(data)
    data = shijian(data)
    data = shop_fenduan(data)
    data = slide_cnt(data)
    data = zuhe(data)
    print('---- global statistical features ----')
    data = item(data)
    data = user(data)
    data = user_item(data)
    data = user_shop(data)
    data = shop_item(data)
    print(data.head(0))
    # ---- offline evaluation: train on days 18-23, validate on day 24 ----
    train = data[(data['day'] >= 18) & (data['day'] <= 23)]
    test = data[(data['day'] == 24)]
    best_iter = lgbCV(train, test)
    # ---- online submission: all labelled days vs the unlabelled test set ----
    train = data[data.is_trade.notnull()]
    test = data[data.is_trade.isnull()]
    sub(train, test, best_iter)
```