Big Data Analysis and Applications (蔡孟勳) Course Final Project
Team members: 陳彥妤, 林旭容
- Dataset
Google Play Store Apps | Kaggle
- Motivation
Analyze how each app attribute relates to the number of installs and identify which factors influence installs the most, giving app developers a basis for setting the direction of a new app.
- Models
Decision Tree, Random Forest, KNN, ANN
- Handling missing values
Drop every row that contains a missing value:
data.dropna(inplace=True)
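For context, `data` is the DataFrame loaded from the Kaggle CSV; a minimal loading sketch, assuming the dataset's default filename `googleplaystore.csv`:
import pandas as pd

# Assumed filename: the CSV that ships with the Kaggle dataset
data = pd.read_csv('googleplaystore.csv')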
- Converting categorical columns
Use LabelEncoder to convert each categorical column into numeric codes:
from sklearn.preprocessing import LabelEncoder

# Fit a fresh encoding for each categorical column
for col in ['Category', 'Content Rating', 'Type', 'Genres',
            'Last Updated', 'Current Ver', 'Android Ver']:
    data[col] = LabelEncoder().fit_transform(data[col])
- Per-column processing
- Size
# 把資料中的"M"去掉
df_size['Size'] = df_size['Size'].map(lambda x: str(x)[:-1])
# String轉float
df_size['Size'] = df_size['Size'].astype('float')
# 標記1~100M, 100M~
for i in range(len(df_size)):
if df_size['Size'].iat[i] < 100:
df_size['Size'].iat[i] = '2'
else:
df_size['Size'].iat[i] = '3'
# 合併Dataframe
data = pd.concat([data,df_size])
data['Size'] = data['Size'].astype('int')
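Note that in the raw Kaggle data the Size column also contains 'k'-suffixed values and the string 'Varies with device', which the one-character strip above does not handle. A hedged sketch of one way to normalize all three cases before binning (an assumption about the preprocessing, not the original code):
def size_to_mb(s):
    """Convert a raw Size string to megabytes; None if it varies by device."""
    s = str(s)
    if s.endswith('M'):
        return float(s[:-1])
    if s.endswith('k'):
        return float(s[:-1]) / 1024  # kilobytes to megabytes
    return None  # 'Varies with device'

df_size['Size'] = df_size['Size'].map(size_to_mb)
df_size = df_size.dropna(subset=['Size'])  # drop sizes that vary by device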
- Installs
# 把資料中的"+"去掉
data['Installs'] = data['Installs'].map(lambda x: str(x)[:-1]) # 把資料中的","去掉
data['Installs'] = data['Installs'].str.replace(',', '') # String轉Int
data['Installs'] = data['Installs'].astype('int')
# Use replace() to bin Installs into 4 intervals (1+ to 5,000+ -> 0, 10,000+ to 5,000,000+ -> 1, 10,000,000+ to 500,000,000+ -> 2, 1,000,000,000+ -> 3)
data['Installs'] = data['Installs'].replace([1,5,10,50,100,500,1000,5000], 0)
data['Installs'] = data['Installs'].replace([10000,50000,100000,500000,1000000,5000000], 1)
data['Installs'] = data['Installs'].replace([10000000,50000000,100000000,500000000], 2)
data['Installs'] = data['Installs'].replace([1000000000], 3)
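The chained replace() calls work because Installs takes only these discrete values; an equivalent formulation with pd.cut makes the interval boundaries explicit (a sketch, not the original code):
import pandas as pd

# Bin edges chosen so the four intervals match the replace() mapping above
bins = [0, 5000, 5000000, 500000000, float('inf')]
data['Installs'] = pd.cut(data['Installs'], bins=bins, labels=[0, 1, 2, 3]).astype('int')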
- Price
# 把資料中的"$"去掉
data['Price'] = data['Price'].str.lstrip('$')
# String轉float
data['Price'] = data['Price'].astype('float')
- Standardization & outlier removal
Apply z-score standardization to the Reviews column, then drop outliers (z > 3):
data['Reviews'] = data['Reviews'].astype('int')
mean = data['Reviews'].mean()
std = data['Reviews'].std()
# z-score standardization
data['Reviews'] = (data['Reviews'] - mean) / std
# Drop outlier rows with a z-score above 3
data = data[data['Reviews'] <= 3]
data = data.reset_index(drop=True)
- Feature selection
Use PCA (retaining 95% of the variance) together with Info Gain and Gain Ratio:
from sklearn.decomposition import PCA

# features: the numeric feature matrix (the data columns other than Installs)
pca = PCA(n_components=0.95, whiten=True)
features_pca = pca.fit_transform(features)
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_pca.shape[1])
# Info Gain and Gain Ratio
!pip install info_gain
from info_gain import info_gain

# 'Installs' is the target column
X = data.drop(['Installs'], axis=1)
y = data.Installs

print('info_gain:')
infogain = {}
for col in data.columns:
    infogain[col] = info_gain.info_gain(data[col], data['Installs'])
for item in sorted(infogain.items(), key=lambda item: item[1]):
    print(item)

print('\ngain_ratio:')
gainratio = {}
for col in data.columns:
    gainratio[col] = info_gain.info_gain_ratio(data[col], data['Installs'])
for item in sorted(gainratio.items(), key=lambda item: item[1]):
    print(item)
# Based on the PCA and Info Gain / Gain Ratio results, drop the 'Type' and 'Content Rating' columns
data = data.drop(labels=['Type', 'Content Rating'],axis=1)
Model building: Decision Tree, Random Forest, KNN, ANN
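The model blocks below all use X_train, X_test, y_train, and y_test, whose construction is not shown in this writeup; presumably a standard split of the cleaned data. A minimal sketch (the 80/20 ratio and the random seed are assumptions, not the original settings):
from sklearn.model_selection import train_test_split

# Features and target after the preprocessing above
X = data.drop(['Installs'], axis=1)
y = data['Installs']
# Assumed 80/20 split; the project's actual split parameters are not shown
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)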
- Decision Tree
from sklearn import tree, metrics

dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train, y_train)
# Per-class precision/recall/F1 and the confusion matrix on the test set
print(metrics.classification_report(y_true=y_test, y_pred=dtc.predict(X_test)))
print(metrics.confusion_matrix(y_true=y_test, y_pred=dtc.predict(X_test)))
import pydotplus
from IPython.display import Image
feature_names = ['Category', 'Rating', 'Reviews', 'Size', 'Price', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
target_names = ['0', '1', '2', '3']
dot_data = tree.export_graphviz(dtc, out_file=None, rounded=True, special_characters=True, feature_names=feature_names, class_names=target_names, max_depth=3)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
- Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

rfc = RandomForestClassifier(n_estimators=5)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, rfc_pred))
print(classification_report(y_test, rfc_pred))

# Plot the feature importances, sorted ascending
imp = rfc.feature_importances_
names = X_train.columns  # importances line up with the training columns
imp, names = zip(*sorted(zip(imp, names)))
plt.barh(range(len(names)), imp, align='center')
plt.yticks(range(len(names)), names)
plt.xlabel('Importance of Features')
plt.ylabel('Features')
plt.title('Importance of Each Feature')
plt.show()
- KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import matplotlib.pyplot as plt

# Try K = 1..9 and record the test error rate for each
error_rate = []
for i in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10, 6))
plt.plot(range(1, 10), error_rate, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

# Refit with K = 9
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print('WITH k=9\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))
- ANN
from sklearn import preprocessing, neural_network

# Scale the features to [0, 1] before training the MLP
mms = preprocessing.MinMaxScaler()
X_train_mms = mms.fit_transform(X_train)
X_test_mms = mms.transform(X_test)

mlp = neural_network.MLPClassifier()
mlp.fit(X_train_mms, y_train)
#mlp.predict(mms.transform([[0, 4.1, 0, 0, 0, 9, 561, 2582, 8]]))
print(mlp.score(X_test_mms, y_test))
- ROC curves: Random Forest achieves the highest accuracy of the four models (a sketch of the curve computation follows the next bullet).
- Random Forest visualization: Rating is the feature most strongly related to the number of installs.
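The ROC comparison code itself is not included in this writeup; since Installs has four classes, the curves were presumably computed one-vs-rest. A hedged sketch of how such curves could be produced for the fitted random forest:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# One-vs-rest ROC for the four install classes (a sketch, not the original plotting code)
classes = [0, 1, 2, 3]
y_test_bin = label_binarize(y_test, classes=classes)
y_score = rfc.predict_proba(X_test)  # class probabilities from the random forest

for i, cls in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    plt.plot(fpr, tpr, label=f'class {cls} (AUC = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('One-vs-rest ROC, Random Forest')
plt.legend()
plt.show()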