forked from ontoportal/ncbo_cron
-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add a script to count graphs in triple store
- Loading branch information
1 parent
af5b4b5
commit 2481b26
Showing
2 changed files
with
109 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/usr/bin/env ruby

# Command-line entry point: counts the triples of every graph in the triple
# store and saves the result to the file given with --store.

# Exit cleanly from an early interrupt
Signal.trap("INT") { exit 1 }

# Set up the bundled gems in our environment
require 'bundler/setup'

# Configure the process for the current cron configuration.
# NOTE(review): assumes lib/ncbo_cron loads NcboCron::GraphsCounts -- confirm.
require_relative '../lib/ncbo_cron'
config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__))
abort("Please create a config/config.rb file using the config/config.rb.sample as a template") unless config_exists
require_relative '../config/config'

require 'logger'
require 'optparse'

# Single source of truth for the default, so the help text cannot drift from it.
default_logfile = 'logs/graph_counts_generation.log'
options = { logfile: default_logfile }
opt_parser = OptionParser.new do |opts|
  opts.on('--store FILEPATH', 'save the results in this file') do |filepath|
    options[:savefile] = filepath
  end
  opts.on('-l', '--logfile FILE', "Write log to FILE (default is '#{default_logfile}')") do |filename|
    options[:logfile] = filename
  end
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end
opt_parser.parse!

logger = Logger.new(options[:logfile])
puts "Processing details are logged to #{options[:logfile]}"
unless options[:savefile]
  message = 'Please provide a file to save the results'
  logger.error(message)
  # Report the problem on stderr too and exit with a non-zero status, so that
  # callers (cron, shell scripts) can detect the failure.
  abort("#{message} (use --store FILEPATH)")
end

generator = NcboCron::GraphsCounts.new
generator.run(logger, options[:savefile])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#!/usr/bin/env ruby | ||
require 'json'
require 'logger'
require 'optparse'
|
||
module NcboCron
  # Counts the triples stored in every named graph of the triple store and
  # flags "zombie" graphs: graphs whose URI looks like a submission data graph
  # but does not match the latest submission of any ontology.
  #
  # Declared as a class (not a module) because callers instantiate it with
  # NcboCron::GraphsCounts.new.
  class GraphsCounts
    # Pattern source (string) of the URIs used for submission data graphs.
    SUBMISSION_DATA_GRAPH = 'http://data\.bioontology\.org/ontologies/[^/]+/submissions/\d+'
    # Compiled once instead of on every zombie_graph? call.
    SUBMISSION_DATA_GRAPH_REGEX = Regexp.new(SUBMISSION_DATA_GRAPH).freeze
    private_constant :SUBMISSION_DATA_GRAPH_REGEX

    # Computes the triple count and zombie flag of every graph and writes the
    # result to file_path as JSON: { graph_uri => [triple_count, zombie?] }.
    #
    # @param logger [#info] receives progress messages
    # @param file_path [String] path of the JSON file to (over)write
    def run(logger, file_path)
      logger.info('Start generating graphs counts')
      logger.info('Fetch ontologies data graphs')
      @all_ontologies = LinkedData::Models::Ontology.all
      # compact drops nil entries, so ontologies for which latest_submission
      # returns nil cannot break the id comparison in zombie_graph?.
      @all_subs = @all_ontologies.map { |ont| ont.latest_submission(status: :any) }.compact

      logger.info('Fetch all graphs URIs')
      result = graphs_list.each_with_object({}) do |graph, counts|
        logger.info("Calculate the triple count of #{graph}")
        counts[graph] = [graph_count_triples(graph), zombie_graph?(graph)]
      end

      logger.info("Save the result in file #{file_path}")
      save_result_in_file(file_path, result)
      logger.info('Finish generating graphs counts')
    end

    private

    # Serializes results as JSON into file_path, replacing any previous content.
    def save_result_in_file(file_path, results)
      File.open(file_path, 'w') do |f|
        f.write(results.to_json)
      end
    end

    # True when graph looks like a submission data graph but none of the
    # submissions fetched by #run has that URI as its id.
    def zombie_graph?(graph)
      return false unless SUBMISSION_DATA_GRAPH_REGEX.match?(graph.to_s)

      @all_subs.none? { |sub| sub.id.to_s == graph.to_s }
    end

    # SPARQL reference to a graph: uses the object's own N-Triples form when
    # it provides one, otherwise wraps the plain URI string in angle brackets
    # (graphs_list returns plain strings).
    def graph_reference(graph)
      graph.respond_to?(:to_ntriples) ? graph.to_ntriples : "<#{graph}>"
    end

    # Returns the number of triples in the given graph (0 when the query
    # yields no solution).
    def graph_count_triples(graph)
      query = <<~SPARQL
        SELECT (COUNT(?s) as ?count) WHERE {
          GRAPH #{graph_reference(graph)} {
            ?s ?p ?v
          }}
      SPARQL
      rs = Goo.sparql_query_client.query(query)
      count = 0
      rs.each do |sol|
        count = sol[:count].object
      end
      count
    end

    # Returns the URIs (as strings) of all graphs that hold at least one triple.
    def graphs_list
      query = <<~SPARQL
        SELECT DISTINCT ?g WHERE {
          GRAPH ?g {
            ?s ?p ?v
          }}
      SPARQL
      rs = Goo.sparql_query_client.query(query)
      rs.solutions.map { |sol| sol[:g].to_s }
    end
  end
end