Complete the community analysis part
bestony committed Oct 2, 2018
1 parent 878e79c commit 34c886b
Showing 4 changed files with 106 additions and 7 deletions.
4 changes: 2 additions & 2 deletions grank/core.py
@@ -8,7 +8,7 @@

 from .libs import helpers
 from .libs import query
-from .script import activity,crawler
+from .script import activity,crawler,social

 @click.group()
 def main():
@@ -60,6 +60,7 @@ def repo(organization,repo):
     config = helpers.get_config()
     data = crawler.fetch_repo_data(organization,repo,config)
     activity.analyse_repo(organization,repo,data,config)
+    social.analyse_repo(organization,repo,data,config)
     pass

 @main.command()
@@ -79,6 +80,5 @@ def clean():
     helpers.clean_directory()
     pass

-
 if __name__ == '__main__':
     main()
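The repo command now feeds the same crawled result into both the activity and the new social analyser. The sketch below shows the shape that crawler.fetch_repo_data is assumed to return, inferred only from the keys social.analyse_repo reads further down; the field names come from the diff, the concrete values are invented for illustration.

# Assumed shape of the crawler result consumed by both analysers;
# key and column names are taken from the diff, values are examples only.
data = {
    "pullRequestArray": [],   # fetched alongside commits, not yet used by the social analysis
    "commitArray": [
        {"author": "alice@example-corp.com", "date": "2018-09-30", "times": 1},
        {"author": "bob@volunteer.example", "date": "2018-09-30", "times": 2},
    ],
}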
43 changes: 40 additions & 3 deletions grank/libs/helpers.py
@@ -157,19 +157,34 @@ def export_csv(series,name):
     """Export a CSV file"""
     series.to_csv("output/%s.csv" % name);

-def get_activity_avarage_instance():
+def get_activity_average_instance():
     """Get the activity average-score DataFrame instance"""
     if not os.path.isfile("output/activity_average.pkl"):
         pd.DataFrame(data={'name': [], 'score': []}).to_pickle("output/activity_average.pkl")
     return pd.read_pickle("output/activity_average.pkl")

-def set_activity_avarage(instance,owner,repository,score):
+def get_social_average_instance():
+    """Get the social (community) average-score DataFrame instance"""
+    if not os.path.isfile("output/social_average.pkl"):
+        pd.DataFrame(data={'name': [], 'score': []}).to_pickle("output/social_average.pkl")
+    return pd.read_pickle("output/social_average.pkl")
+
+def set_activity_average(instance,owner,repository,score):
     """Persist the intermediate activity scores and update the rank CSV"""
     instance = instance.append(pd.Series({"owner":owner,"name":repository,"score":score}),ignore_index=True)
     instance = instance.drop_duplicates(subset=["owner","name"]).sort_values(["score"],ascending=False)
     instance.to_pickle("output/activity_average.pkl")
     instance.to_csv("result/activity_rank.csv")

+def set_social_average(instance,owner,repository,score):
+    """Persist the intermediate social scores and update the rank CSV"""
+
+    instance = instance.append(pd.Series({"owner":owner,"name":repository,"score":score}),ignore_index=True)
+    instance = instance.drop_duplicates(subset=["owner","name"]).sort_values(["score"],ascending=False)
+
+    instance.to_pickle("output/social_average.pkl")
+    instance.to_csv("result/social_rank.csv")
+
 def series_to_pickle(df,name):
     """Save the data to a pickle file"""
     df.to_pickle("output/%s.pkl" % name)
@@ -189,9 +204,31 @@ def generate_activity_line_number(start_time,end_time,top_number):
     fig.savefig("result/activity_line.png")
     plt.close(fig)

+def generate_social_line_number(start_time,end_time,top_number):
+    """Generate the line chart of the top social scores"""
+    df = pd.read_pickle("output/social_average.pkl")
+    all_df = pd.DataFrame(data=[],index=pd.date_range(start=start_time,end=end_time,freq="W"))
+
+    for index, row in df.iterrows():
+        if len(all_df.columns) < top_number:
+            all_df[row["name"]] = pd.read_pickle("output/social_%s.pkl" % row["name"])["score"]
+        else:
+            break
+
+    fig = all_df.plot().get_figure()
+    fig.savefig("result/social_line.png")
+    plt.close(fig)
+
 def clean_directory():
     """Empty the temporary and result directories"""
     shutil.rmtree('output',ignore_errors=True)
     shutil.rmtree('result',ignore_errors=True)
     click.echo("Workspace is empty now!")
-    pass
+    pass
+
+def is_corp(email,config):
+    """Check whether a commit author's email belongs to the configured company"""
+    if config["corp"]["keyword"] in email:
+        return True
+    else:
+        return False
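The new is_corp helper is the hinge of the community analysis: an author counts as corporate when the configured keyword appears in their email address. Below is a standalone sketch of that split, assuming a config shaped like the one the function reads ({"corp": {"keyword": ...}}); the keyword and the sample addresses are made up.

# Standalone sketch of the corporate/community split; the config shape
# mirrors what is_corp reads above, the emails are invented examples.
config = {"corp": {"keyword": "@example-corp.com"}}

def is_corp(email, config):
    # Same substring test as helpers.is_corp in this commit
    return config["corp"]["keyword"] in email

commits = [
    {"author": "alice@example-corp.com", "times": 3},   # corporate
    {"author": "bob@volunteer.example", "times": 2},    # community
]

community = [c for c in commits if not is_corp(c["author"], config)]
print([c["author"] for c in community])   # ['bob@volunteer.example']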
4 changes: 2 additions & 2 deletions grank/script/activity.py
@@ -73,15 +73,15 @@ def analyse_repo(owner,repository,data,config):

     # Get the average-score instance, used for the ranking below

-    instance = helpers.get_activity_avarage_instance()
+    instance = helpers.get_activity_average_instance()

     # Save the project's activity scores to a new pickle for the later line-chart output

     helpers.series_to_pickle(new_df,repository)

     # Re-rank the average-score instance

-    helpers.set_activity_avarage(instance,owner,repository,target_score)
+    helpers.set_activity_average(instance,owner,repository,target_score)

     # Export the project's CSV data
     helpers.export_csv(new_df,"%s" % repository)
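Beyond the spelling fix (avarage to average), these helpers maintain a persistent ranking DataFrame: each analysed repository contributes an (owner, name, score) row, duplicates are dropped, and the frame is re-sorted before being written out. Here is a pandas-only sketch of that round-trip, with no files written; the repository names and scores are invented, and pd.concat stands in for the DataFrame.append call shown in the diff, which newer pandas releases have removed.

# In-memory sketch of the ranking round-trip performed by
# get_*_average_instance / set_*_average; names and scores are
# invented, pd.concat replaces the now-removed DataFrame.append.
import pandas as pd

instance = pd.DataFrame(data={"owner": [], "name": [], "score": []})

for owner, name, score in [
    ("octocat", "hello-world", 0.42),
    ("torvalds", "linux", 0.87),
    ("pandas-dev", "pandas", 0.65),
]:
    row = pd.DataFrame([{"owner": owner, "name": name, "score": score}])
    instance = pd.concat([instance, row], ignore_index=True)
    instance = (
        instance.drop_duplicates(subset=["owner", "name"])
                .sort_values(["score"], ascending=False)
    )

print(instance)   # highest score first, one row per repository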
62 changes: 62 additions & 0 deletions grank/script/social.py
@@ -0,0 +1,62 @@
+from ..libs import query
+from ..libs import helpers
+import pandas as pd
+import numpy as np
+import click
+import math
+def analyse_repo(owner,repository,data,config):
+    click.echo("Analysing community engagement")
+    pullRequestArray = data["pullRequestArray"]
+    commitArray = data["commitArray"]
+
+    start_time=config["time"]["start_time"]
+    end_time=config["time"]["end_time"]
+    date_range = pd.date_range(start=start_time,end=end_time,freq="W")
+    date_series = pd.Series(np.zeros((len(date_range),), dtype=int),index=date_range)
+
+
+    social_all_frame= pd.DataFrame(commitArray)
+    social_all_frame = social_all_frame[social_all_frame.date != "未标注时间"]  # drop commits whose date the crawler left as "未标注时间" (no date recorded)
+    social_all_frame["date"] = pd.to_datetime(social_all_frame['date'])
+    for index,row in social_all_frame.iterrows():
+        social_all_frame.loc[index,"author"] = helpers.is_corp(row["author"],config)  # True for corporate authors, False for community authors
+
+    community_df = social_all_frame[social_all_frame.author != True].set_index('date').resample('W')['times'].sum()
+    social_all_df = social_all_frame.set_index('date').resample('W')['times'].sum()
+
+    social_all_df = social_all_df.loc[start_time:end_time]
+    community_df = community_df.loc[start_time:end_time]
+
+    temp_community_series = pd.Series(np.zeros((len(date_range),), dtype=int),index=date_range)
+    temp_social_series = pd.Series(np.zeros((len(date_range),), dtype=int),index=date_range)
+
+    for item in community_df.index:
+        if item in date_series.index:
+            temp_community_series[item] = community_df[item]
+
+    for item in social_all_df.index:
+        if item in date_series.index:
+            temp_social_series[item] = social_all_df[item]
+
+    social_df = pd.DataFrame({
+        "community_member":temp_community_series.values,
+        "all_member":temp_social_series.values,
+    },index = date_range)
+    social_df = social_df.cumsum()
+    social_df["score"] = social_df.apply(lambda row: row.community_member / row.all_member , axis=1)
+
+    target_social_score = social_df["score"].sum() / len(social_df)
+
+    instance = helpers.get_social_average_instance()
+
+    helpers.series_to_pickle(social_df,"social_%s" % repository)
+
+    helpers.set_social_average(instance,owner,repository,target_social_score)
+
+    helpers.export_csv(social_df,"social_%s" % repository)
+
+    helpers.generate_social_line_number(start_time,end_time,int(config["rank"]["top"]))
+
+    print(target_social_score)
+    exit()
+    pass
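The score itself is the share of cumulative community commits in cumulative total commits, evaluated per week and then averaged over the whole window. A self-contained sketch of just that scoring step on four synthetic weekly buckets (the commit counts are invented); it reproduces the cumsum/apply/mean sequence from the function above.

# Synthetic illustration of the scoring step: weekly community and total
# commit counts are accumulated, the weekly share is computed, and the
# repository score is the mean of those weekly shares.
import pandas as pd

weeks = pd.date_range(start="2018-01-07", periods=4, freq="W")
social_df = pd.DataFrame(
    {"community_member": [1, 1, 2, 2], "all_member": [2, 2, 4, 4]},
    index=weeks,
).cumsum()

social_df["score"] = social_df.apply(
    lambda row: row.community_member / row.all_member, axis=1
)

target_social_score = social_df["score"].sum() / len(social_df)
print(target_social_score)   # 0.5: half of the cumulative commits come from the community

In the real function the weekly counts come from the resampled commitArray, and the resulting figure is what set_social_average feeds into result/social_rank.csv.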
