Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: create script to count graphs #28

Merged
merged 4 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions bin/ncbo_cron
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,26 @@ runner.execute do |opts|
end
end

# Schedule the graph counts report: a background thread registers a cron-style
# locking job that runs NcboCron::GraphsCounts and writes its result to
# NcboCron.settings.graph_counts_report_path.
#
# NOTE(review): this job is gated by the :enable_ontologies_report option and
# reuses the :cron_ontologies_report schedule. lib/ncbo_cron/config.rb defines
# an enable_graphs_counts setting, but no matching command-line option is
# visible here -- confirm whether sharing the ontologies report switch is intended.
if options[:enable_ontologies_report]
  graphs_report_thread = Thread.new do
    graphs_count_report_options = options.dup
    graphs_count_report_options[:job_name] = "ncbo_cron_graphs_counts_report"
    graphs_count_report_options[:scheduler_type] = :cron
    graphs_count_report_options[:cron_schedule] = graphs_count_report_options[:cron_ontologies_report]

    # Log the path the job really writes to (the graph counts report path),
    # not the ontologies report path.
    graph_counts_report_path = NcboCron.settings.graph_counts_report_path
    logger.info "Setting up graph counts report generation job with #{graphs_count_report_options[:cron_schedule]}"
    logger.info "Writing graph counts report into #{graph_counts_report_path}"; logger.flush

    # Details of each run go to a dedicated log file next to the main log.
    ontologies_report_log_path = File.join(log_path, "#{log_filename_noExt}-graph-counts-report.log")
    ontologies_report_logger = Logger.new(ontologies_report_log_path)

    NcboCron::Scheduler.scheduled_locking_job(graphs_count_report_options) do
      logger.info "Starting graph counts report generation"; logger.flush
      logger.info "Logging graph counts report generation details to #{ontologies_report_log_path}"; logger.flush
      t0 = Time.now
      NcboCron::GraphsCounts.new(ontologies_report_logger, graph_counts_report_path).run
      logger.info "Graph counts report generation job completed in #{Time.now - t0} sec."; logger.flush
      logger.info "Finished graph counts report generation"; logger.flush
    end
  end
end
if options[:enable_index_synchronizer]
index_synchronizer_thread = Thread.new do
index_synchronizer_options = options.dup
Expand Down
37 changes: 37 additions & 0 deletions bin/ncbo_generate_graph_count
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env ruby

# Command-line entry point that runs NcboCron::GraphsCounts once.
#
# Options:
#   --store FILEPATH     file the result is saved in
#                        (default: NcboCron.settings.graph_counts_report_path)
#   -l, --logfile FILE   write the log to FILE (default: STDOUT)
#   -h, --help           print usage and exit

# Exit cleanly from an early interrupt
Signal.trap("INT") { exit 1 }

# Set up the bundled gems in our environment
require 'bundler/setup'

# Configure the process for the current cron configuration.
require_relative '../lib/ncbo_cron'
config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__))
abort("Please create a config/config.rb file using the config/config.rb.sample as a template") unless config_exists
require_relative '../config/config'
require_relative '../lib/ncbo_cron/graphs_counts'

require 'optparse'

# Defaults. The setting name must match the one assigned in
# lib/ncbo_cron/config.rb (graph_counts_report_path).
options = {
  savefile: NcboCron.settings.graph_counts_report_path,
  logfile: STDOUT
}

opt_parser = OptionParser.new do |opts|
  opts.on('--store FILEPATH', 'save the results in this file') do |filepath|
    options[:savefile] = filepath
  end
  opts.on('-l', '--logfile FILE', 'Write log to FILE (default is STDOUT)') do |filename|
    options[:logfile] = filename
  end
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end

begin
  opt_parser.parse!
rescue OptionParser::ParseError => e
  # Report bad or incomplete options with the usage text instead of a backtrace.
  abort("#{e.message}\n#{opt_parser}")
end

logger = Logger.new(options[:logfile])
# Only announce the log location when it is a file; interpolating the STDOUT
# IO object would print its inspect string.
puts "Processing details are logged to #{options[:logfile]}" if options[:logfile].is_a?(String)

NcboCron::GraphsCounts.new(logger, options[:savefile]).run
6 changes: 6 additions & 0 deletions lib/ncbo_cron/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def config(&block)
@settings.enable_mapping_counts ||= true
# enable ontology analytics
@settings.enable_ontology_analytics ||= true
# enable graph count
@settings.enable_graphs_counts ||= true
# enable ontologies report
@settings.enable_ontologies_report ||= true
# enable index synchronization
Expand Down Expand Up @@ -76,6 +78,10 @@ def config(&block)
@settings.cron_ontologies_report ||= "30 1 * * *"
# Ontologies Report file location
@settings.ontology_report_path = "../../reports/ontologies_report.json"

# Graph counts report file location
@settings.graph_counts_report_path = "../../reports/graph_counts.json"

# Index synchronizer schedule
# 30 3 */2 * * - run every 2 days at 3:30AM
@settings.cron_index_synchronizer ||= "30 3 */2 * *"
Expand Down
83 changes: 83 additions & 0 deletions lib/ncbo_cron/graphs_counts.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env ruby
require 'fileutils'
require 'json'
require 'logger'
require 'optparse'
require 'set'

module NcboCron
  # Counts the triples held in every named graph of the triple store and saves
  # the result as JSON.
  #
  # The saved document maps each graph URI to a two-element array:
  # `[triple_count, zombie]`. A graph is reported as a "zombie" when its URI
  # looks like a submission data graph (see SUBMISSION_DATA_GRAPH) but does not
  # equal the id of the latest submission of any ontology. Only latest
  # submissions are looked up, so graphs of older submissions are flagged too.
  class GraphsCounts
    # Pattern (regexp source) that identifies submission data graph URIs.
    SUBMISSION_DATA_GRAPH = 'http://data\.bioontology\.org/ontologies/[^/]+/submissions/\d+'
    # Report location used when no file path is given to the constructor.
    DATA_SAVE = '/srv/ontoportal/data/reports/graph_counts.json'
    # Compiled once instead of on every zombie check.
    SUBMISSION_DATA_GRAPH_REGEX = Regexp.new(SUBMISSION_DATA_GRAPH).freeze

    attr_reader :logger, :file_path

    # @param logger [Logger, nil] destination for progress messages; a logger
    #   writing to STDOUT is created when nil
    # @param file_path [String, nil] where the JSON report is written and read;
    #   falls back to DATA_SAVE when nil
    def initialize(logger = nil, file_path = nil)
      # Honor the caller's path; it used to be discarded in favor of DATA_SAVE.
      @file_path = file_path || DATA_SAVE
      @logger = logger || Logger.new(STDOUT)
    end

    # Reads a previously saved report.
    #
    # @return [Hash] the parsed JSON document, or an empty hash when the
    #   report file does not exist
    def read_graph_counts
      return {} unless File.exist?(file_path)

      JSON.parse(File.read(file_path))
    end

    # Generates the report: lists every graph, counts its triples, flags
    # zombie graphs and saves the result in #file_path.
    def run
      logger.info('Start generating graphs counts')
      logger.info('Fetch ontologies data graphs')
      @all_ontologies = LinkedData::Models::Ontology.all
      @all_subs = @all_ontologies.map { |ont| ont.latest_submission(status: :any) }.compact
      # Index the known submission ids once so each zombie check is a set
      # lookup instead of a scan over all submissions.
      @submission_ids = @all_subs.map { |sub| sub.id.to_s }.to_set

      logger.info('Fetch all graphs URIs')
      result = {}
      graphs_list.each do |graph|
        logger.info("Calculate the triple count of #{graph}")
        result[graph] = [graph_count_triples(graph), zombie_graph?(graph)]
      end

      logger.info("Save the result in file #{file_path}")
      save_result_in_file(file_path, result)
      logger.info('Finish generating graphs counts')
    end

    private

    # Serializes +results+ as JSON into +file_path+, creating the parent
    # directory first when it is missing.
    def save_result_in_file(file_path, results)
      FileUtils.mkdir_p(File.dirname(file_path))
      File.write(file_path, results.to_json)
    end

    # True when +graph+ looks like a submission data graph but matches no
    # known latest submission id. Graphs that do not look like submission
    # graphs are never zombies.
    def zombie_graph?(graph)
      return false unless SUBMISSION_DATA_GRAPH_REGEX.match?(graph)

      !@submission_ids.include?(graph.to_s)
    end

    # Returns the number of triples stored in +graph+ (0 when the query
    # yields no solution).
    def graph_count_triples(graph)
      query = <<~SPARQL
        SELECT (COUNT(?s) as ?count) WHERE {
          GRAPH <#{graph}> {
            ?s ?p ?v
        }}
      SPARQL
      count = 0
      Goo.sparql_query_client.query(query).each_solution do |sol|
        count = sol[:count].object
      end
      count
    end

    # Returns the URIs (as strings) of all graphs holding at least one triple.
    def graphs_list
      query = <<~SPARQL
        SELECT DISTINCT ?g WHERE {
          GRAPH ?g {
            ?s ?p ?v
        }}
      SPARQL
      Goo.sparql_query_client.query(query).each_solution.map { |sol| sol[:g].to_s }
    end
  end
end
Loading