diff --git a/.gitignore b/.gitignore
index 172c006..3b99ee0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@ venv/
 # ignore py compiled etc. files
 *.pyc
 *.pyo
+
+# ignore .idea
+.idea/
diff --git a/config/defaults.py b/config/defaults.py
index cf153d9..c697437 100644
--- a/config/defaults.py
+++ b/config/defaults.py
@@ -84,11 +84,18 @@ MUTPB_EN_OUT_LINK = 0.5  # probability to add an outgoing triple (otherwise in)
 MUTPB_AE = 0.2  # prob to try adding an edge between two nodes
 MUTPB_ID = 0.05  # prob to increase distance between source and target by 1 hop
-MUTPB_FV = 0.4  # prob to fix a variable (SPARQL)
+MUTPB_FV = 0.25  # prob to fix a variable (SPARQL)
 MUTPB_FV_RGTP_SAMPLE_N = 128  # sample <= n remaining GTPs to fix variables for
 MUTPB_FV_SAMPLE_MAXN = 32  # max n of instantiations to sample from top k
 MUTPB_FV_QUERY_LIMIT = 256  # SPARQL query limit for the top k instantiations
 MUTPB_SP = 0.05  # prob to simplify pattern (warning: can restrict exploration)
+# TODO: lower MUTPB_DN
+MUTPB_DN = 0.6  # prob to try adding a deep and narrow path to a pattern
+MUTPB_DN_MAX_HOPS = 10  # max number of hops in the deep narrow path
+MUTPB_DN_MAX_HOPS_ALPHA = 1.15  # alpha value of the path length beta distribution
+MUTPB_DN_MAX_HOPS_BETA = 1.85  # beta value of the path length beta distribution
+MUTPB_DN_AVG_DEG_LIMIT = 10  # max avg. fan-out (reachable nodes) allowed per hop
+MUTPB_DN_MAX_HOP_INST = 10  # max number of hop instantiations used for the next query/ies
 # fusion of target candidates:
 FUSION_SAMPLES_PER_CLASS = 500  # only use up to n training samples per class
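The alpha/beta defaults above steer how long the sampled deep narrow paths are. As a quick tuning aid, a minimal sketch (the helper name is hypothetical, not part of the patch) that empirically checks the path-length distribution induced by a given alpha/beta, using the same sampling expression as mutate_deep_narrow_path below:

import random
from collections import Counter

def sampled_hop_length_pdf(alpha=1.15, beta=1.85, max_hops=10, samples=100000):
    # same sampling expression as in mutate_deep_narrow_path:
    # n = int(random.betavariate(alpha, beta) * max_hops + 1)
    counts = Counter(
        int(random.betavariate(alpha, beta) * max_hops + 1)
        for _ in range(samples)
    )
    return {n: counts[n] / float(samples) for n in sorted(counts)}

# sampled_hop_length_pdf() returns a dict {path_length: relative frequency};
# a smaller alpha or a larger beta shifts the mass towards shorter paths.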
diff --git a/gp_learner.py b/gp_learner.py
index 008310f..d183978 100644
--- a/gp_learner.py
+++ b/gp_learner.py
@@ -54,6 +54,8 @@ from gp_query import query_stats
 from gp_query import query_time_hard_exceeded
 from gp_query import query_time_soft_exceeded
+from gp_query import deep_narrow_path_query
+from gp_query import deep_narrow_path_inst_query
 from gp_query import variable_substitution_query
 from graph_pattern import canonicalize
 from graph_pattern import gen_random_var
@@ -685,6 +687,142 @@ def mutate_fix_var(
     return res
 
 
+def mutate_deep_narrow_path(
+        sparql,
+        timeout,
+        gtp_scores,
+        child,
+        directions=None,
+        child_in_queries=False,
+        limit=None,  # TODO: use a limit for the queries?
+):
+    """Finds n-hop connections from source to target and adds them to a given
+    graph pattern.
+
+    The outline of the mutation is as follows:
+    - If the given GP is not evaluated yet, evaluate it to work on its
+      matching node pairs.
+    - If not passed in, randomly select the path length and the directions
+      of the single hops.
+    - Issue SPARQL queries to find hops (from the source and the target side)
+      whose average fan-out stays below the configured limit. Use at most a
+      configured number of the found hops to search for the next hop.
+      When only one hop is left to find, try to instantiate paths that fit an
+      STP. If such a path is found, its hops are added to the GP. As there can
+      be more than one such path, the mutation returns a list of patterns.
+
+    :param directions: list of directions to use for the hops
+        (1: Source -> Target, -1: Target -> Source,
+        0 (or anything else): choose randomly)
+    :param child_in_queries: if True, add the triples of the given pattern to
+        the queries
+    :param limit: SPARQL query limit
+    :return: list of children to which a deep narrow path was added
+    """
+    if not child.fitness.valid:
+        ev = evaluate(
+            sparql, timeout, gtp_scores, child, run=-1, gen=-1)
+        update_individuals([child], [ev])
+    gtps = child.matching_node_pairs
+    if not gtps:
+        return [child]
+    if directions:
+        n = len(directions) - 1
+    else:
+        alpha = config.MUTPB_DN_MAX_HOPS_ALPHA
+        beta = config.MUTPB_DN_MAX_HOPS_BETA
+        max_hops = config.MUTPB_DN_MAX_HOPS
+        # more likely to create shorter paths
+        # with default values the distribution is as follows:
+        # PDF: 1: 14 %, 2: 27 %, 3: 25 %, 4: 17 %, 5: 10 %, 6: 5 %, 7: 1.5 %, ...
+        # CDF: 1: 14 %, 2: 40 %, 3: 66 %, 4: 83 %, 5: 93 %, 6: 98 %, 7: 99.6 %, ...
+        n = int(random.betavariate(alpha, beta) * max_hops + 1)
+    nodes = [SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)] + [TARGET_VAR]
+    hops = [Variable('p%d' % i) for i in range(n + 1)]
+    if not directions:
+        directions = [0 for _ in range(n + 1)]
+    directions = [
+        random.choice([-1, 1]) if d not in [-1, 1] else d for d in directions
+    ]
+    gp_hops = [
+        # directions[i] == 1 => hop in the direction source -> target
+        GraphPattern([(nodes[i], hops[i], nodes[i + 1])]) if directions[i] == 1
+        # directions[i] == -1 => hop in the direction target -> source
+        else GraphPattern([(nodes[i + 1], hops[i], nodes[i])])
+        for i in range(n + 1)
+    ]
+    # queries to get the first n hops:
+    valueblocks_s = {}
+    valueblocks_t = {}
+    for i in range(n // 2 + 1):
+        if i < int(n/2):
+            t, q_res = deep_narrow_path_query(
+                sparql,
+                timeout,
+                child,
+                hops[i],
+                nodes[i + 1],
+                valueblocks_s,
+                gp_hops[:i + 1],
+                SOURCE_VAR,
+                gp_in=child_in_queries,
+            )
+            if not q_res:
+                return [child]
+            valueblocks_s[hops[i]] = {
+                (hops[i],): random.sample(
+                    [(q_r,) for q_r in q_res],
+                    min(config.MUTPB_DN_MAX_HOP_INST, len(q_res))
+                )
+            }
+        if n - i > i:
+            t, q_res = deep_narrow_path_query(
+                sparql,
+                timeout,
+                child,
+                hops[n - i],
+                nodes[n - i],
+                valueblocks_t,
+                gp_hops[n - i:],
+                TARGET_VAR,
+                gp_in=child_in_queries,
+            )
+            if not q_res:
+                return [child]
+            valueblocks_t[hops[n - i]] = {
+                (hops[n - i],): random.sample(
+                    [(q_r,) for q_r in q_res],
+                    min(config.MUTPB_DN_MAX_HOP_INST, len(q_res))
+                )
+            }
+
+    # query to get the last hop and instantiations that connect source and
+    # target
+    valueblocks = {}
+    valueblocks.update(valueblocks_s)
+    valueblocks.update(valueblocks_t)
+    t, q_res = deep_narrow_path_inst_query(
+        sparql,
+        timeout,
+        child,
+        hops,
+        valueblocks,
+        gp_hops,
+        gp_in=child_in_queries
+    )
+    if not q_res:
+        return [child]
+    res = [
+        child + GraphPattern([
+            (nodes[i], qr[i], nodes[i + 1]) if directions[i] == 1
+            else (nodes[i + 1], qr[i], nodes[i])
+            for i in range(n + 1)
+        ]) for qr in q_res
+    ]
+    return res
+
+
 def mutate_simplify_pattern(gp):
     if len(gp) < 2:
         return gp
@@ -797,6 +935,7 @@ def mutate(
         pb_mv=config.MUTPB_MV,
         pb_sp=config.MUTPB_SP,
         pb_sv=config.MUTPB_SV,
+        pb_dn=config.MUTPB_DN,
 ):
     # mutate patterns:
     # grow: select random identifier and convert them into a var (local)
@@ -835,10 +974,10 @@ def mutate(
         child = canonicalize(child)
         children = mutate_fix_var(sparql, timeout, gtp_scores, child)
     else:
-        children = [child]
-
-
-    # TODO: deep & narrow paths mutation
+        if random.random() < pb_dn:
+            children = mutate_deep_narrow_path(sparql, timeout,
+                                                gtp_scores, child)
+        else:
+            children = [child]
 
     children = {
         c if fit_to_live(c) else orig_child
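Since the direction handling in mutate_deep_narrow_path is easy to misread, here is a small self-contained sketch (illustrative only, with stand-in variables, not part of the patch) of how a directions list is turned into the hop triples that get appended to the child pattern:

from rdflib import Variable

SOURCE_VAR = Variable('source')  # stand-ins for the constants in graph_pattern.py
TARGET_VAR = Variable('target')

def hop_triples(directions):
    # one entry per hop, so n intermediate nodes for n + 1 hops
    n = len(directions) - 1
    nodes = [SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)] + [TARGET_VAR]
    hops = [Variable('p%d' % i) for i in range(n + 1)]
    return [
        (nodes[i], hops[i], nodes[i + 1]) if directions[i] == 1
        else (nodes[i + 1], hops[i], nodes[i])
        for i in range(n + 1)
    ]

# hop_triples([1, -1]) yields (?source ?p0 ?n0) and (?target ?p1 ?n0),
# i.e. the second hop points from the target back to the intermediate node.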
diff --git a/gp_query.py b/gp_query.py
index 0a4618d..1763ece 100644
--- a/gp_query.py
+++ b/gp_query.py
@@ -62,6 +62,8 @@ def __init__(self):
         self.ask_multi_query_count = 0
         self.combined_ask_count_multi_query_count = 0
         self.variable_substitution_query_count = 0
+        self.useful_path_query_count = 0
+        self.useful_path_inst_query_count = 0
         self.predict_query_count = 0
         self.count_query_count = 0
@@ -695,6 +697,144 @@ def _var_subst_chunk_result_ext(q_res, _sel_var_and_vars, _, **kwds):
 def _var_subst_res_update(res, update, **_):
     res += update
+
+
+def deep_narrow_path_query(
+        sparql,
+        timeout,
+        graph_pattern,
+        var_to_fix,
+        var_to_count,
+        valueblocks,
+        steps,
+        startvar,
+        avglimit=config.MUTPB_DN_AVG_DEG_LIMIT,
+        gp_in=False,
+        batch_size=None
+):
+    _query_stats.useful_path_query_count += 1
+    # TODO: maybe batch_size = batch_size - 10 * number of valueblocks for hops
+    _values = graph_pattern.matching_node_pairs
+    # TODO: maybe use STPs that aren't well covered yet
+    _ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
+    _vars_steps_and_stuff = (
+        var_to_fix, var_to_count, startvar, valueblocks, steps, avglimit, gp_in
+    )
+    return _multi_query(
+        sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs,
+        batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping,
+        _deep_narrow_path_res_init, _deep_narrow_path_chunk_q,
+        _deep_narrow_path_chunk_result_ext, _deep_narrow_path_res_update
+    )
+
+
+# noinspection PyUnusedLocal
+def _deep_narrow_path_res_init(_, **kwds):
+    return []
+
+
+def _deep_narrow_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk):
+    var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \
+        = _vars_steps_and_stuff
+    valueblocks = {
+        startvar: {
+            (startvar,):
+                [(tup[0],) for tup in values_chunk] if startvar == SOURCE_VAR
+                else [(tup[1],) for tup in values_chunk]
+        }
+    }
+    valueblocks.update(_valueblocks)
+    return gp.to_sparql_deep_narrow_path_query(
+        var_to_fix,
+        var_to_count,
+        valueblocks,
+        steps,
+        startvar,
+        avglimit=avglimit,
+        gp_in=gp_in
+    )
+
+
+# noinspection PyUnusedLocal
+def _deep_narrow_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds):
+    var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \
+        = _vars_steps_and_stuff
+    chunk_res = []
+    res_rows_path = ['results', 'bindings']
+    bindings = sparql_json_result_bindings_to_rdflib(
+        get_path(q_res, res_rows_path, default=[])
+    )
+    for row in bindings:
+        # TODO: maybe return the avg degree too
+        chunk_res.append(get_path(row, [var_to_fix]))
+    return chunk_res
+
+
+def _deep_narrow_path_res_update(res, update, **_):
+    res += update
+
+
+def deep_narrow_path_inst_query(
+        sparql,
+        timeout,
+        graph_pattern,
+        hop,
+        valueblocks,
+        steps,
+        gp_in=False,
+        batch_size=None
+):
+    _query_stats.useful_path_inst_query_count += 1
+    # TODO: maybe batch_size = batch_size - 10 * number of valueblocks for hops
+    _values = graph_pattern.matching_node_pairs
+    # TODO: maybe use STPs that aren't well covered yet
+    _ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
+    _vars_steps_and_stuff = (hop, valueblocks, steps, gp_in)
+    return _multi_query(
+        sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs,
+        batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping,
+        _deep_narrow_path_inst_res_init, _deep_narrow_path_inst_chunk_q,
+        _deep_narrow_path_inst_chunk_result_ext,
+        _deep_narrow_path_inst_res_update
+    )
+
+
+#
noinspection PyUnusedLocal +def _deep_narrow_path_inst_res_init(_, **kwds): + return [] + + +def _deep_narrow_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk): + hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff + valueblocks = { + 'st': { + (SOURCE_VAR, TARGET_VAR): values_chunk + } + } + valueblocks.update(_valueblocks) + return gp.to_sparql_deep_narrow_path_inst_query( + hop, valueblocks, steps, gp_in=gp_in + ) + + +# noinspection PyUnusedLocal +def _deep_narrow_path_inst_chunk_result_ext( + q_res, _vars_steps_and_stuff, _, **kwds +): + hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff + chunk_res = [] + res_rows_path = ['results', 'bindings'] + bindings = sparql_json_result_bindings_to_rdflib( + get_path(q_res, res_rows_path, default=[]) + ) + + for row in bindings: + chunk_res.append([get_path(row, [h]) for h in hop]) + return chunk_res + + +def _deep_narrow_path_inst_res_update(res, update, **_): + res += update def generate_stps_from_gp(sparql, gp): diff --git a/graph_pattern.py b/graph_pattern.py index a483c88..d46f654 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -245,7 +245,7 @@ def canonicalize(gp, shorten_varnames=True): and len(gp.nodes) == len(cgp.nodes) and len(gp.edges) == len(cgp.edges) and sorted(gp.identifier_counts().values()) == - sorted(cgp.identifier_counts().values()) + sorted(cgp.identifier_counts().values()) ): # canonicalization should never change any of the features above, but it # did before (e.g., https://github.com/RDFLib/rdflib/issues/494 ). @@ -636,6 +636,107 @@ def to_sparql_select_query( res = textwrap.dedent(res) return self._sparql_prefix(res) + + def to_sparql_deep_narrow_path_query( + self, + var_to_fix, + var_to_count, + valueblocks, + steps, + startvar, + avglimit=10, + gp_in=False + ): + # TODO: Maybe use a limit + count_var_to_count = Variable('c' + ''.join(var_to_count)) + avg_var_to_count = Variable('avgc' + ''.join(var_to_count)) + res = "SELECT %(vtf)s (AVG(%(cvtc)s) as %(avtc)s) {\n" \ + "SELECT %(stv)s %(vtf)s (COUNT (%(vtc)s) as %(cvtc)s) {\n" \ + "%(val)s" \ + "%(trip)s }\n" \ + "GROUP BY %(stv)s %(vtf)s }\n" \ + "GROUP BY %(vtf)s\n" \ + "HAVING (AVG (%(cvtc)s) < %(avgl)s)" % { + 'vtf': ''.join(var_to_fix.n3()), + 'cvtc': ''.join(count_var_to_count.n3()), + 'avtc': ''.join(avg_var_to_count.n3()), + 'stv': ''.join(startvar.n3()), + 'vtc': ''.join(var_to_count.n3()), + 'val': ''.join([ + 'VALUES (%s) {\n%s }\n' % ( + ' '.join(var.n3() for var in valueblocks[key].keys()[0]), + ''.join(['(%s)\n' % + ' '.join(self.curify(v) for v in vt) + for vt in valueblocks[key][(key,)]]) + ) for key in valueblocks.keys() + ]), + 'trip': ''.join([ + '%s %s %s .\n' % (s.n3(), p.n3(), o.n3()) + for step in steps + for s, p, o in step + ]) + ''.join([ + self._sparql_triples_part(indent=' ') if gp_in else '' + ]), + 'avgl': str(avglimit), + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) + + + def to_sparql_deep_narrow_path_inst_query( + self, + hop, + valueblocks, + steps, + gp_in=False + ): + # TODO: Maybe use a limit + res = "SELECT %(vtf)s (COUNT (?source) as ?cst) {\n" \ + "%(val)s" \ + "%(trip)s }\n" \ + "GROUP BY %(vtf)s\n" \ + "HAVING (COUNT (?source) > 0)" % { + 'vtf': ' '.join([var.n3() for var in hop]), + 'val': ''.join([ + 'VALUES (%s) {\n%s }\n' % ( + ' '.join(var.n3() for var in valueblocks[key].keys()[0]), + ''.join(['(%s)\n' % + ' '.join(self.curify(v) for v in vt) + for vt in valueblocks[key].values()[0]]) + ) for key in valueblocks.keys() + ]), + 'trip': ''.join([ + '%s %s %s .\n' % (s.n3(), 
p.n3(), o.n3()) + for step in steps + for s, p, o in step + ]) + ''.join([ + self._sparql_triples_part(indent=' ') if gp_in else '' + ]), + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) + + def to_sparql_precheck_query( + self, + values, + gp_in=False + ): + res = "SELECT * {\n" \ + "%(val)s\n" \ + "%(trip)s\n" \ + "}\n" \ + "LIMIT 1" % { + 'val': ''.join( + self._sparql_values_part(values=values, indent=' ') + ), + 'trip': ''.join(self._sparql_triples_part(indent=' ')) + + ''.join([ + self._sparql_triples_part(indent=' ') if gp_in else '' + ]), + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) + def to_sparql_ask_query( self, bind=None, @@ -656,9 +757,9 @@ def _sparql_query_pattern_part( ): assert bind is None or isinstance(bind, dict) assert values is None or ( - isinstance(values, dict) and - isinstance(next(six.iterkeys(values)), Iterable) and - isinstance(next(six.itervalues(values)), Iterable) + isinstance(values, dict) and + isinstance(next(six.iterkeys(values)), Iterable) and + isinstance(next(six.itervalues(values)), Iterable) ) res = '' @@ -1042,7 +1143,6 @@ def rate_graph_pattern(self, gp): ] return res - def prune_counts(self, below=2): lns = len(self.identifier_gt_node_sum) ln = len(self.identifier_gt_node_count) @@ -1069,7 +1169,7 @@ def prune_counts(self, below=2): def __str__(self): return '%s: pairs: %d, nodes: %d, Identifier counts:\n' \ - 'Pairs: %s\nNodes: %s' % ( - self.__class__.__name__, len(self.gt_pairs), len(self.nodes), - self.identifier_gt_pair_count, self.identifier_gt_node_count - ) + 'Pairs: %s\nNodes: %s' % ( + self.__class__.__name__, len(self.gt_pairs), len(self.nodes), + self.identifier_gt_pair_count, self.identifier_gt_node_count + ) diff --git a/tests/test_mutate_deep_narrow.py b/tests/test_mutate_deep_narrow.py new file mode 100644 index 0000000..b636735 --- /dev/null +++ b/tests/test_mutate_deep_narrow.py @@ -0,0 +1,204 @@ +# coding=utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +"""Testet die verschiedenen Versionen der mutatete_deep_narrow +""" + +import logging +import numpy as np +import pickle +import random +from collections import defaultdict +from collections import OrderedDict +from os import getenv + +import SPARQLWrapper +from itertools import chain +from splendid import get_path +from splendid import time_func +import socket +import rdflib +from rdflib import BNode +from rdflib import Literal +from rdflib import URIRef +from rdflib import Variable + +from config import SPARQL_ENDPOINT +from gp_learner import evaluate +from gp_learner import mutate_deep_narrow_path +from gp_learner import mutate_fix_var +from gp_learner import update_individuals +from gp_query import calibrate_query_timeout +from gp_query import query_time_hard_exceeded +from gp_query import query_time_soft_exceeded +from graph_pattern import gen_random_var +from graph_pattern import GraphPattern +from graph_pattern import SOURCE_VAR +from graph_pattern import TARGET_VAR +from ground_truth_tools import get_semantic_associations +from ground_truth_tools import split_training_test_set +from gtp_scores import GTPScores +from serialization import print_graph_pattern +from utils import sparql_json_result_bindings_to_rdflib + +logger = logging.getLogger(__name__) + +sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT) +# sparql = SPARQLWrapper.SPARQLWrapper( +# getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql')) +try: + timeout = max(5, 
calibrate_query_timeout(sparql)) # 5s for warmup +except IOError: + from nose import SkipTest + raise SkipTest( + "Can't establish connection to SPARQL_ENDPOINT:\n %s\n" + "Skipping tests in\n %s" % (sparql.endpoint, __file__)) + +dbr = rdflib.Namespace('http://dbpedia.org/resource/') +owl = rdflib.Namespace('http://www.w3.org/2002/07/owl#') +dbo = rdflib.Namespace('http://dbpedia.org/ontology/') +gold = rdflib.Namespace('http://purl.org/linguistics/gold') +dbt = rdflib.Namespace('http://dbpedia.org/resource/Template:') +dbp = rdflib.Namespace('http://dbpedia.org/property/') + +v = [gen_random_var() for i in range(100)] + +sameAs = owl['sameAs'] +pwl = dbo['wikiPageWikiLink'] +hypernym = gold['hypernym'] +wpUseTemp = dbp['wikiPageUsesTemplate'] + +gp_found = {} +gp_found['1'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (SOURCE_VAR, v[0], v[1]), + (v[1], hypernym, TARGET_VAR) +]) +gp_found['2'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], URIRef('http://dbpedia.org/dbtax/Page')) +]) +gp_found['3'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Sister_project_links']) +]) +gp_found['4'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, wpUseTemp, dbt['Pp-semi-indef']) +]) +gp_found['5'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], dbt['Pp-semi-indef']) +]) +gp_found['6'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Cite_book']) +]) +gp_found['7'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Redirect']) +]) +gp_found['8'] = GraphPattern([ + (SOURCE_VAR, hypernym, TARGET_VAR) +]) +gp_found['50'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Use_dmy_dates']) +]) +gp_found['51'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Refend']) +]) +gp_found['52'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + URIRef('http://dbpedia.org/dbtax/Page')) +]) +gp_found['54'] = GraphPattern([ + (SOURCE_VAR, hypernym, TARGET_VAR), + (v[0], sameAs, SOURCE_VAR) +]) +gp_found['55'] = GraphPattern([ + (SOURCE_VAR, hypernym, TARGET_VAR), + (TARGET_VAR, pwl, SOURCE_VAR) +]) +gp_found['67'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Portal']) +]) +gp_found['68'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Convert']) +]) +gp_found['69'] = GraphPattern([ + (SOURCE_VAR, hypernym, TARGET_VAR), + (v[0], hypernym, SOURCE_VAR) +]) +gp_found['72'] = GraphPattern([ + (SOURCE_VAR, URIRef('http://purl.org/dc/terms/subject'), v[1]), + (TARGET_VAR, pwl, SOURCE_VAR), + (v[0], sameAs, v[1]), + (v[1], URIRef('http://www.w3.org/2004/02/skos/core#subject'), TARGET_VAR) +]) +gp_found['94'] = GraphPattern([ + (SOURCE_VAR, URIRef('http://purl.org/dc/terms/subject'), v[1]), + (TARGET_VAR, v[0], SOURCE_VAR), + (v[1], URIRef('http://www.w3.org/2004/02/skos/core#subject'), TARGET_VAR) +]) +gp_found['131'] = GraphPattern([ + (SOURCE_VAR, v[0], v[2]), + (TARGET_VAR, pwl, v[1]), + (v[2], URIRef('http://www.w3.org/2004/02/skos/core#subject'), TARGET_VAR), +]) +gp_found['140'] = GraphPattern([ + (TARGET_VAR, pwl, 
SOURCE_VAR), + (TARGET_VAR, wpUseTemp, dbt['Other_uses']), + (TARGET_VAR, wpUseTemp, dbt['Pp-move-indef']), + (v[0], URIRef('http://www.w3.org/2000/01/rdf-schema#seeAlso'), TARGET_VAR), +]) +# Bis hier jedes mit neuem Fingerprint, jetzt noch 3 vom Rest +gp_found['231'] = GraphPattern([ + (SOURCE_VAR, dbo['class'], TARGET_VAR), + (TARGET_VAR, dbp['subdivisionRanks'], v[0]) +]) +gp_found['323'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (v[0], dbp['species'], TARGET_VAR), + (v[1], dbo['wikiPageDisambiguates'], TARGET_VAR) +]) +gp_found['516'] = GraphPattern([ + (SOURCE_VAR, pwl, v[1]), + (TARGET_VAR, dbp['image'], v[0]), + (v[1], hypernym, TARGET_VAR), + (v[2], dbo['wikiPageRedirects'], SOURCE_VAR) +]) + + +def main(): + ground_truth_pairs = get_semantic_associations() + ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs) + # ground_truth_pairs = ground_truth_pairs[:100] + gtp_scores = GTPScores(ground_truth_pairs) + res = [] + for i in range(100): + key = random.choice(gp_found.keys()) + gp_ = gp_found[key] + # eval_gp(gtp_scores, gp_) + r = mutate_deep_narrow_path(sparql, timeout, gtp_scores, gp_) + logger.info(i) + logger.info(r) + res.append(r) + + +if __name__ == '__main__': + main() diff --git a/tests/test_sampling.py b/tests/test_sampling.py new file mode 100644 index 0000000..044449d --- /dev/null +++ b/tests/test_sampling.py @@ -0,0 +1,398 @@ +# coding=utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +"""Tested das bauen von graph_pattern per gesampeltem finden von 1-hop wegen +und fix-var-mutation +""" + +import logging +import random +from collections import defaultdict +from collections import OrderedDict +from os import getenv + +import SPARQLWrapper +from splendid import get_path +from splendid import time_func +import socket +import rdflib +from rdflib import BNode +from rdflib import Literal +from rdflib import URIRef +from rdflib import Variable + +from config import SPARQL_ENDPOINT +from gp_learner import evaluate +from gp_learner import mutate_fix_var +from gp_learner import update_individuals +from gp_query import calibrate_query_timeout +from gp_query import query_time_hard_exceeded +from gp_query import query_time_soft_exceeded +from graph_pattern import GraphPattern +from graph_pattern import SOURCE_VAR +from graph_pattern import TARGET_VAR +from ground_truth_tools import get_semantic_associations +from ground_truth_tools import split_training_test_set +from gtp_scores import GTPScores +from serialization import print_graph_pattern +from utils import sparql_json_result_bindings_to_rdflib + +logger = logging.getLogger(__name__) + +sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT) +# sparql = SPARQLWrapper.SPARQLWrapper( +# getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql')) +try: + timeout = max(5, calibrate_query_timeout(sparql)) # 5s for warmup +except IOError: + from nose import SkipTest + raise SkipTest( + "Can't establish connection to SPARQL_ENDPOINT:\n %s\n" + "Skipping tests in\n %s" % (sparql.endpoint, __file__)) + +dbp = rdflib.Namespace('http://dbpedia.org/resource/') +owl = rdflib.Namespace('http://www.w3.org/2002/07/owl#') + +a = Variable('a') +b = Variable('b') +c = Variable('c') +d = Variable('d') +e = Variable('e') +f = Variable('f') +v = Variable('v') +w = Variable('w') + +sameAs = owl['sameAs'] + +gp_1 = GraphPattern([ + (SOURCE_VAR, v, TARGET_VAR) +]) + +gp_2 = GraphPattern([ + (SOURCE_VAR, v, TARGET_VAR), + (TARGET_VAR, w, SOURCE_VAR) +]) + +gp_3 = 
GraphPattern([ + (SOURCE_VAR, a, b), + (b, c, d), + (d, e, TARGET_VAR) +]) + +gp_4 = GraphPattern([ + (SOURCE_VAR, a, b), + (b, c, d), + (TARGET_VAR, e, d) +]) + +gp_5 = GraphPattern([ + (SOURCE_VAR, a, c), + (TARGET_VAR, URIRef('http://dbpedia.org/ontology/thumbnail'), d), + (TARGET_VAR, URIRef('http://dbpedia.org/property/image'), b), + (c, URIRef('http://dbpedia.org/ontology/wikiPageWikiLink'), SOURCE_VAR), + (c, URIRef('http://purl.org/linguistics/gold/hypernym'), TARGET_VAR) +]) + +ground_truth_pairs_1 = [ + (dbp['Berlin'], dbp['Germany']), + (dbp['Hamburg'], dbp['Germany']), + (dbp['Kaiserslautern'], dbp['Germany']), + (dbp['Wien'], dbp['Austria']), + (dbp['Insbruck'], dbp['Austria']), + (dbp['Salzburg'], dbp['Austria']), + (dbp['Paris'], dbp['France']), + (dbp['Lyon'], dbp['France']), + (dbp['Amsterdam'], dbp['Netherlands']), + (dbp['Brussels'], dbp['Belgium']), + (dbp['Washington'], dbp['United_States']), + (dbp['Madrid'], dbp['Spain']), + (dbp['Prague'], dbp['Czech_Republic']), + (dbp['Bern'], dbp['Switzerland']), +] + +ground_truth_pairs_2 = get_semantic_associations() +ground_truth_pairs_2, _ = split_training_test_set(ground_truth_pairs_2) +ground_truth_pairs_2 = random.sample(ground_truth_pairs_2, 100) + +ground_truth_pairs_3 = [ + (dbp['Barrister'], dbp['Law']), + (dbp['Christ'], dbp['Jesus']), + (dbp['Pottage'], dbp['Soup']) + ] + +ground_truth_pairs_4 = [ + (dbp['Motorrad_(disambiguation)'], dbp['Bmw_motorcycle']), + (dbp['Horse'], dbp['Saddle']) +] + +gtp_scores_1 = GTPScores(ground_truth_pairs_1) +gtp_scores_2 = GTPScores(ground_truth_pairs_2) +gtp_scores_3 = GTPScores(ground_truth_pairs_3) +gtp_scores_4 = GTPScores(ground_truth_pairs_4) + + +def test_count(gtps, max_out): + # values = {(SOURCE_VAR, TARGET_VAR): gtps} hier besser nur die sources + source_list = [(stp[0], ) for stp in gtps] + values = {(SOURCE_VAR, ): source_list} + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, TARGET_VAR)]) + # SPARQL-Query die über eine Var aus gp1 random samplet + q = gp1.to_sparql_filter_by_count_out_query( + values=values, count_node=b, max_out=max_out, limit=200) + logger.info(q) + t, q_res1 = run_query(q) + logger.info(q_res1) + # Kreiere b_list in der die Ergebnisse für b "gespeichert" sind + # TODO: als Methode, die Listenform (Tupellistenform) der gefundenen + # Bindings zu gewünschten Variablen zurückgibt. 
+ res_rows_path = ['results', 'bindings'] + bind1 = sparql_json_result_bindings_to_rdflib( + get_path(q_res1, res_rows_path, default=[]) + ) + b_list = [] + for row in bind1: + x = get_path(row, [b]) + y = (x, ) + b_list.append(y) + logger.info('orig query took %.4f s, result:\n%s\n', t, b_list) + b_list[:] = [b_l for b_l in b_list if not list_remove_bool(b_l[0])] + b_list = list(set(b_list)) + # Values für die nächste query: b_list + values = {(b, ): b_list} + # Query die über eine var aus gp2 random samplet mit values aus b_list + q = gp2.to_sparql_select_sample_query(values=values, limit=5000) + logger.info(q) + try: + t, q_res2 = run_query(q) + except: + return [] + # Kreiere target_list, in der die "gefundenen" Targets vermerkt sind + bind2 = sparql_json_result_bindings_to_rdflib( + get_path(q_res2, res_rows_path, default=[]) + ) + target_list = [] + for row in bind2: + target_list.append(get_path(row, [TARGET_VAR])) + logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) + # Kreire gtps_2 in der alle gtps, deren targets in target_list enthalten + # sind, "gespeichert" werden + gtps_2 = [] + for t in target_list: + for gtp in gtps: + if t == gtp[1]: + gtps_2.append(gtp) + logger.info(gtps_2) + + # GraphPattern mit gefixten Pfaden aus den gefundenen gtp kreieren: + # TODO: Das ganze als Methode aus einem graph-pattern, den results und + # den stp + gp_list = [] + for row2 in bind2: + for gtp in gtps: + if gtp[1] == get_path(row2, [TARGET_VAR]): + for row1 in bind1: + if get_path(row1, [b]) == get_path(row2, [b]): + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (b, get_path(row2, [c]), TARGET_VAR) + ]) + if gp_ not in gp_list: + gp_list.append(gp_) + + # gp3 = GraphPattern([ + # (SOURCE_VAR, a, b), + # (b, c, TARGET_VAR) + # ]) + gtp_scores = GTPScores(gtps) + # gtp_scores2 = GTPScores(gtps_2) + + # # Fixe das pattern über die gefundenen gtps + # mfv2 = [] + # if len(gtps_2) > 1: + # mfv2 = mutate_fix_var(sparql, timeout, gtp_scores2, gp3) + # + # # lasse die gefundenen Pattern einmal durch die fix_var laufen + # mfv = [] + # for gp_mfv2 in mfv2: + # mfv_res = mutate_fix_var(sparql, timeout, gtp_scores, gp_mfv2) + # for gp_res in mfv_res: + # mfv.append(gp_res) + # + # # evaluiere die so gefundenen Pattern + # res_eval = eval_gp_list(gtp_scores, mfv) + # return res_eval + + # evaluiere die gefixten pattern + res_eval = eval_gp_list(gtp_scores, gp_list) + return res_eval + + +def test_sample(gtps): + values = {(SOURCE_VAR, TARGET_VAR): gtps} + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, TARGET_VAR)]) + # SPARQL-Query die über eine Var aus gp1 random samplet. 
+ # TODO: Query so verändern, dass nach count gefiltert wird (siehe log.txt) + q = gp1.to_sparql_select_sample_query(values=values, limit=100) + logger.info(q) + t, q_res1 = run_query(q) + logger.info(q_res1) + # Kreiere b_list in der die Ergebnisse für b "gespeichert" sind + res_rows_path = ['results', 'bindings'] + bind1 = sparql_json_result_bindings_to_rdflib( + get_path(q_res1, res_rows_path, default=[]) + ) + b_list = [] + for row in bind1: + x = get_path(row, [b]) + y = (x, ) + b_list.append(y) + logger.info('orig query took %.4f s, result:\n%s\n', t, b_list) + b_list[:] = [b_l for b_l in b_list if not list_remove_bool(b_l[0])] + # Values für die nächste query: b_list + values = {(b, ): b_list} + # Query die über eine var aus gp2 random samplet mit values aus b_list + q = gp2.to_sparql_select_sample_query(values=values, limit=5000) + logger.info(q) + t, q_res2 = run_query(q) + # Kreiere target_list, in der die "gefundenen" Targets vermerkt sind + bind2 = sparql_json_result_bindings_to_rdflib( + get_path(q_res2, res_rows_path, default=[]) + ) + target_list = [] + for row in bind2: + target_list.append(get_path(row, [TARGET_VAR])) + logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) + # Kreire gtps_2 in der alle gtps, deren targets in target_list enthalten + # sind, "gespeichert" werden + gtps_2 = [] + for t in target_list: + for gtp in gtps: + if t == gtp[1]: + gtps_2.append(gtp) + logger.info(gtps_2) + + # GraphPattern mit gefixten Pfaden aus den gefundenen gtp kreieren: + # TODO: Das ganze als Methode aus einem graph-pattern, den results und + # den stp + gp_list = [] + for row2 in bind2: + for gtp in gtps: + if gtp[1] == get_path(row2, [TARGET_VAR]): + for row1 in bind1: + if get_path(row1, [b]) == get_path(row2, [b]): + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (b, get_path(row2, [c]), TARGET_VAR) + ]) + if gp_ not in gp_list: + gp_list.append(gp_) + + # gp3 = GraphPattern([ + # (SOURCE_VAR, a, b), + # (b, c, TARGET_VAR) + # ]) + gtp_scores = GTPScores(gtps) + # gtp_scores2 = GTPScores(gtps_2) + + # # Fixe das pattern über die gefundenen gtps + # mfv2 = [] + # if len(gtps_2) > 1: + # mfv2 = mutate_fix_var(sparql, timeout, gtp_scores2, gp3) + # + # # lasse die gefundenen Pattern einmal durch die fix_var laufen + # mfv = [] + # for gp_mfv2 in mfv2: + # mfv_res = mutate_fix_var(sparql, timeout, gtp_scores, gp_mfv2) + # for gp_res in mfv_res: + # mfv.append(gp_res) + # + # # evaluiere die so gefundenen Pattern + # res_eval = eval_gp_list(gtp_scores, mfv) + # return res_eval + + # evaluiere die gefixten pattern + res_eval = eval_gp_list(gtp_scores, gp_list) + return res_eval + + +# Runs a given (as String) query against the Sparql-endpoint +def run_query(q): + try: + q_short = ' '.join((line.strip() for line in q.split('\n'))) + sparql.setQuery(q_short) + cal = time_func(sparql.queryAndConvert) + except socket.timeout: + cal = (timeout, {}) + except ValueError: + # e.g. 
if the endpoint gives us bad JSON for some unicode chars
+        logger.info(
+            'Could not parse result for query, assuming empty result...\n'
+            'Query:\n%s\nException:', q,
+            exc_info=1,  # appends exception to message
+        )
+        cal = (timeout, {})
+    return cal
+
+
+# Checks if a found RDF term can be used as a value in a new query
+# (without conflicts)
+def list_remove_bool(var):
+    if isinstance(var, Literal):
+        i_n3 = var.n3()
+        if len(i_n3) > 60:
+            return True
+    elif isinstance(var, BNode):
+        return True
+    # really ugly, but the only way I found to avoid problems with the
+    # Category:Cigarettes example (see docs)
+    # TODO: possibly make sure these are not thrown out but just not
+    # shortened with a prefix, i.e. check that curify does what it is
+    # supposed to do
+    elif isinstance(var, URIRef):
+        return ':' in var[7:]
+    return False
+
+
+# evaluates a given list of graph patterns
+def eval_gp_list(gtp_scores, gp_list):
+    for gp_l in gp_list:
+        res_ev = evaluate(
+            sparql, timeout, gtp_scores, gp_l, run=0, gen=0)
+        update_individuals([gp_l], [res_ev])
+        # print_graph_pattern(gp_, print_matching_node_pairs=0)
+    return gp_list
+
+
+if __name__ == '__main__':
+    # # test_sample:
+    # res = []
+    # for i in range(10):
+    #     res_ts = test_sample(ground_truth_pairs_2)
+    #     for gp_ts in res_ts:
+    #         res.append(gp_ts)
+    #
+    # res = sorted(res, key=lambda gp_: -gp_.fitness.values.score)
+    # for res_ in res:
+    #     print_graph_pattern(res_)
+
+    # test_count
+    res = []
+    for i in range(1):
+        ground_truth_pairs_5 = get_semantic_associations()
+        ground_truth_pairs_5 = random.sample(ground_truth_pairs_5, 200)
+        max_out_steps = [10, 15, 20, 25, 30, 40, 50, 75, 100]
+        for j in max_out_steps:
+            res_ts = test_count(ground_truth_pairs_5, j)
+            for gp_ts in res_ts:
+                res.append((gp_ts, j))
+
+    res = sorted(res, key=lambda gp_: -gp_[0].fitness.values.score)
+    res = res[0:100]
+    for res_ in res:
+        print('max_out:' + str(res_[1]))
+        print_graph_pattern(res_[0])
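Finally, a minimal usage sketch that borrows the fixtures from tests/test_mutate_deep_narrow.py above (sparql, timeout, gp_found and the imported helpers); it is not part of either test module and shows how the new mutation can be driven with explicit hop directions instead of the randomly sampled ones:

def demo_fixed_directions():
    # force a 2-hop path around the plain hypernym pattern gp_found['8'];
    # 1 means source -> target, -1 means target -> source, so [1, -1]
    # yields the triples (?source ?p0 ?n0) and (?target ?p1 ?n0)
    ground_truth_pairs = get_semantic_associations()
    ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
    gtp_scores = GTPScores(ground_truth_pairs)
    children = mutate_deep_narrow_path(
        sparql, timeout, gtp_scores, gp_found['8'],
        directions=[1, -1],     # one entry per hop, so path length 2
        child_in_queries=True,  # also include the pattern's own triples
    )
    for child in children:
        print_graph_pattern(child, print_matching_node_pairs=0)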