WIP: deep narrow paths mutation #10

Open — wants to merge 27 commits into base: master

Commits (27)
8fbd1f1
Test for evaluate() and mutate_fix_var()
pneuer Jun 21, 2018
db78db4
modified test_fv_eval.py
pneuer Jun 25, 2018
07d1f39
Test to find one hop patterns with SAMPLE-Queries
pneuer Jul 10, 2018
0837c10
Test finished, alg. not yet in learner
pneuer Aug 30, 2018
f67c730
Deep-and-Narrow-Path-Mutation should be runnable
pneuer Aug 31, 2018
c908caf
Small change and bug fix in gp_learner.py
pneuer Sep 3, 2018
adbc215
Undid modifications to unrelated stuff
pneuer Sep 4, 2018
20e5b34
Renamed two values and added alpha/beta values for the path length …
pneuer Sep 4, 2018
22a786e
Changed values MUTPB_DN_MAX_HOPS_ALPHA / BETA
pneuer Sep 4, 2018
91cbde0
Changed order in mutate_deep_narrow()
pneuer Sep 5, 2018
9c3238a
Renamed MUTPB_DN_AVG_LIMIT
pneuer Sep 5, 2018
6362dc8
Added beta distribution for mut-length and dnp-mut only if not fixvar
pneuer Sep 5, 2018
1130da4
Code-Style changes and renamed mutate_deep_narrow to mutate_deep_narr…
pneuer Sep 5, 2018
12a95ae
Renamed useful_path_(inst_)query to deep_narrow_path_(inst_)query
pneuer Sep 5, 2018
22ca6aa
Renamed to_sparql_useful_path/_inst_query() to to_sparql_deep_narrow_…
pneuer Sep 5, 2018
126e84d
Undid the changes in requirements.txt
pneuer Sep 5, 2018
331e06f
Added default-value for max instances of hops
pneuer Sep 5, 2018
49b5c4d
Renamed the correct to_sparql_deep_narrow_path_inst_query()
pneuer Sep 5, 2018
c0617ea
Added docstring for mutate_deep_narrow_path() and renamed direct and c…
pneuer Sep 5, 2018
82cdacf
Renamed the correct to_sparql_depp_narrow_path_inst_query()
pneuer Sep 5, 2018
9117d05
Comments -> english
pneuer Sep 5, 2018
d792d10
Erased all unused to_sparql_*_query()
pneuer Sep 5, 2018
75bd1ea
Comments -> english
pneuer Sep 5, 2018
72f2fee
deleted test_fv_eval.py and SPARQL-query.py
pneuer Sep 5, 2018
e2e09a4
Erased everything except the test for the mutation in the learner
pneuer Sep 5, 2018
6deb0ad
Erased the use of private methods in to_sparql_deep_narrow_path_(inst…
pneuer Sep 6, 2018
05ae843
Changed the alpha/beta values for the path-length distribution and the …
pneuer Sep 6, 2018
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@ venv/
# ignore py compiled etc. files
*.pyc
*.pyo

# ignore .idea
.idea/
9 changes: 8 additions & 1 deletion config/defaults.py
@@ -84,11 +84,18 @@
MUTPB_EN_OUT_LINK = 0.5 # probability to add an outgoing triple (otherwise in)
MUTPB_AE = 0.2 # prob to try adding an edge between two nodes
MUTPB_ID = 0.05 # prob to increase distance between source and target by 1 hop
MUTPB_FV = 0.4 # prob to fix a variable (SPARQL)
MUTPB_FV = 0.25 # prob to fix a variable (SPARQL)
MUTPB_FV_RGTP_SAMPLE_N = 128 # sample <= n remaining GTPs to fix variables for
MUTPB_FV_SAMPLE_MAXN = 32 # max n of instantiations to sample from top k
MUTPB_FV_QUERY_LIMIT = 256 # SPARQL query limit for the top k instantiations
MUTPB_SP = 0.05 # prob to simplify pattern (warning: can restrict exploration)
# TODO: Lower the MUTPB_DN
MUTPB_DN = 0.6 # prob to try adding a deep and narrow path to a pattern
MUTPB_DN_MAX_HOPS = 10 # Max number of hops in the deep narrow path
MUTPB_DN_MAX_HOPS_ALPHA = 1.15 # alpha value in a length beta distribution
MUTPB_DN_MAX_HOPS_BETA = 1.85 # beta value in a length beta distribution
MUTPB_DN_AVG_DEG_LIMIT = 10 # max avg. number of nodes reachable per hop (fan-out limit)
MUTPB_DN_MAX_HOP_INST = 10 # max number of hop instantiations to sample for the next queries

# fusion of target candidates:
FUSION_SAMPLES_PER_CLASS = 500 # only use up to n training samples per class
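The MUTPB_DN_MAX_HOPS_ALPHA / MUTPB_DN_MAX_HOPS_BETA values above parameterise a beta distribution that biases the sampled path length towards shorter paths (the actual draw happens in mutate_deep_narrow_path below). A minimal sketch of that draw under the defaults above; the helper name sample_path_length is made up for illustration:

import random
from collections import Counter

MUTPB_DN_MAX_HOPS = 10
MUTPB_DN_MAX_HOPS_ALPHA = 1.15
MUTPB_DN_MAX_HOPS_BETA = 1.85

def sample_path_length():
    # betavariate() returns a float in [0, 1); scaling by MUTPB_DN_MAX_HOPS
    # and adding 1 yields a hop count between 1 and MUTPB_DN_MAX_HOPS
    return int(random.betavariate(
        MUTPB_DN_MAX_HOPS_ALPHA, MUTPB_DN_MAX_HOPS_BETA
    ) * MUTPB_DN_MAX_HOPS + 1)

# rough empirical check against the PDF/CDF quoted in gp_learner.py
counts = Counter(sample_path_length() for _ in range(100000))
for length in sorted(counts):
    print('%d hops: %.1f %%' % (length, 100.0 * counts[length] / 100000))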
147 changes: 143 additions & 4 deletions gp_learner.py
@@ -54,6 +54,8 @@
from gp_query import query_stats
from gp_query import query_time_hard_exceeded
from gp_query import query_time_soft_exceeded
from gp_query import deep_narrow_path_query
from gp_query import deep_narrow_path_inst_query
from gp_query import variable_substitution_query
from graph_pattern import canonicalize
from graph_pattern import gen_random_var
@@ -685,6 +687,142 @@ def mutate_fix_var(
return res


def mutate_deep_narrow_path(
sparql,
timeout,
gtp_scores,
child,
directions=None,
child_in_queries=False,
limit=None, # TODO: Use a limit for the queries?
):
""" Finds n-hop-connections from Source to Target, to add them to a given
Graph-Pattern.

The outline of the mutation is as follows:
- If not evaluated, evaluates the given GP to work on its matching-node-
pairs
- If not passed in, randomly selects the path-length and the directions
of the single hops.
- Issues SPARQL queries, to find hops (from Source and Target), that don't
have a big fan-out (smaller than the default-value). Uses an default max-
amount of found hops to find the next hop.
When there is only one hop left to find, it tries to instanciate paths,
that fit to an STP. If such a path is found, its hops are added to the GP.
As there could be more than one path, the mutation returns a list of such
patterns.

:param directions: list of directions to use for the hops
(1: Source -> Target, -1: Target -> Source,
0 (or everything else): choose random)
:param child_in_queries: If true: add the triples of the given pattern to
the queries
:param limit: SPARQL limnit
:return: list of children in which a deep_narrow_path is added
"""
if not child.fitness.valid:
ev = evaluate(
sparql, timeout, gtp_scores, child, run=-1, gen=-1)
update_individuals([child], [ev])
gtps = child.matching_node_pairs
if not gtps:
return [child]
if directions:
n = len(directions) - 1
else:
alpha = config.MUTPB_DN_MAX_HOPS_ALPHA
beta = config.MUTPB_DN_MAX_HOPS_BETA
max_hops = config.MUTPB_DN_MAX_HOPS
# more likely to create shorter paths
# with default values the distribution is as follows:
# PDF: 1: 14 %, 2: 27 %, 3: 25 %, 4: 17 %, 5: 10 %, 6: 5 %, 7: 1.5 %, ...
# CDF: 1: 14 %, 2: 40 %, 3: 66 %, 4: 83 %, 5: 93 %, 6: 98 %, 7: 99.6 %, ...
n = int(random.betavariate(alpha, beta) * max_hops + 1)
nodes = [SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)] + [TARGET_VAR]
hops = [Variable('p%d' % i) for i in range(n + 1)]
if not directions:
directions = [0 for _ in range(n + 1)]
directions = [
random.choice([-1, 1]) if d not in [-1, 1] else d for d in directions
]
gp_hops = [
# directions[i] == 1 => hop in the direction source -> target
GraphPattern([(nodes[i], hops[i], nodes[i + 1])]) if directions[i] == 1
# directions[i] == -1 => hop in the direction target -> source
else GraphPattern([(nodes[i + 1], hops[i], nodes[i])])
for i in range(n+1)
]
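# Example: with n == 2 and directions == [1, -1, 1] the hop templates
# above become (?source ?p0 ?n0), (?n1 ?p1 ?n0), (?n1 ?p2 ?target),
# i.e. the middle hop is traversed backwards, from ?n1 to ?n0.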
# queries to get the first n hops:
valueblocks_s = {}
valueblocks_t = {}
for i in range(n // 2 + 1):
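# The loop alternates between the source side (fixing hop i in forward
# order) and the target side (fixing hop n-i in backward order), so all
# but one hop get candidate properties sampled into the value blocks.
# E.g. for n == 3 (hops p0..p3) this fixes p0, p3 and p2 and leaves p1
# to the instantiation query below.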
if i < int(n/2):
t, q_res = deep_narrow_path_query(
sparql,
timeout,
child,
hops[i],
nodes[i+1],
valueblocks_s,
gp_hops[:i + 1],
SOURCE_VAR,
gp_in=child_in_queries,
)
if not q_res:
return [child]
valueblocks_s[hops[i]] = {
(hops[i],): random.sample(
[(q_r,) for q_r in q_res],
min(config.MUTPB_DN_MAX_HOP_INST, len(q_res))
)
}
if n-i > i:
t, q_res = deep_narrow_path_query(
sparql,
timeout,
child,
hops[n-i],
nodes[n-i],
valueblocks_t,
gp_hops[n - i:],
TARGET_VAR,
gp_in=child_in_queries,
)
if not q_res:
return [child]
valueblocks_t[hops[n-i]] = {
(hops[n-i],): random.sample(
[(q_r,) for q_r in q_res],
min(config.MUTPB_DN_MAX_HOP_INST, len(q_res))
)
}

# query to get the last hop and the instantiations that connect source
# and target
valueblocks = {}
valueblocks.update(valueblocks_s)
valueblocks.update(valueblocks_t)
t, q_res = deep_narrow_path_inst_query(
sparql,
timeout,
child,
hops,
valueblocks,
gp_hops,
gp_in=child_in_queries
)
if not q_res:
return [child]
res = [
child + GraphPattern([
(nodes[i], qr[i], nodes[i + 1]) if directions[i] == 1
else (nodes[i + 1], qr[i], nodes[i])
for i in range(n + 1)
]) for qr in q_res
]
return res


def mutate_simplify_pattern(gp):
if len(gp) < 2:
return gp
@@ -797,6 +935,7 @@ def mutate(
pb_mv=config.MUTPB_MV,
pb_sp=config.MUTPB_SP,
pb_sv=config.MUTPB_SV,
pb_dn=config.MUTPB_DN,
):
# mutate patterns:
# grow: select random identifier and convert them into a var (local)
@@ -835,10 +974,10 @@
child = canonicalize(child)
children = mutate_fix_var(sparql, timeout, gtp_scores, child)
else:
children = [child]


# TODO: deep & narrow paths mutation
if random.random() < pb_dn:
children = mutate_deep_narrow_path(sparql, timeout, gtp_scores, child)
else:
children = [child]

children = {
c if fit_to_live(c) else orig_child
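Throughout the new mutation and the query helpers below, value blocks are passed around as nested dicts mapping a key (a hop variable, the start variable, or the label 'st') to a dict from a variable tuple to a list of binding tuples, which the to_sparql_deep_narrow_path_*query() methods presumably render as SPARQL VALUES blocks. A minimal sketch of that assumed shape, inferred from the construction in mutate_deep_narrow_path and _deep_narrow_path_chunk_q; the URIs are purely illustrative:

from rdflib import URIRef, Variable

SOURCE_VAR = Variable('source')  # local stand-in for graph_pattern.SOURCE_VAR
p0 = Variable('p0')

valueblocks = {
    # one block per already fixed hop variable, keyed by that variable
    p0: {
        # a variable tuple mapped to sampled candidate bindings
        (p0,): [
            (URIRef('http://dbpedia.org/ontology/birthPlace'),),
            (URIRef('http://dbpedia.org/ontology/deathPlace'),),
        ],
    },
    # the chunked queries add one more block binding the start variable
    # (or ?source and ?target together) to the matching node pairs
    SOURCE_VAR: {
        (SOURCE_VAR,): [
            (URIRef('http://dbpedia.org/resource/Berlin'),),
        ],
    },
}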
140 changes: 140 additions & 0 deletions gp_query.py
@@ -62,6 +62,8 @@ def __init__(self):
self.ask_multi_query_count = 0
self.combined_ask_count_multi_query_count = 0
self.variable_substitution_query_count = 0
self.useful_path_query_count = 0
self.useful_path_inst_query_count = 0
self.predict_query_count = 0
self.count_query_count = 0

@@ -695,6 +697,144 @@ def _var_subst_chunk_result_ext(q_res, _sel_var_and_vars, _, **kwds):

def _var_subst_res_update(res, update, **_):
res += update


def deep_narrow_path_query(
sparql,
timeout,
graph_pattern,
var_to_fix,
var_to_count,
valueblocks,
steps,
startvar,
avglimit=config.MUTPB_DN_AVG_DEG_LIMIT,
gp_in=False,
batch_size=None
):
_query_stats.useful_path_query_count += 1
# TODO: maybe batch_size = batch_size - 10 * number of valueblocks for hops
_values = graph_pattern.matching_node_pairs
# TODO: maybe only use STPs that are not yet well covered
_ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
_vars_steps_and_stuff = (
var_to_fix, var_to_count, startvar, valueblocks, steps, avglimit, gp_in
)
return _multi_query(
sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs,
batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping,
_deep_narrow_path_res_init, _deep_narrow_path_chunk_q,
_deep_narrow_path_chunk_result_ext, _deep_narrow_path_res_update
)


# noinspection PyUnusedLocal
def _deep_narrow_path_res_init(_, **kwds):
return []


def _deep_narrow_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk):
var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \
= _vars_steps_and_stuff
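# For each chunk of matching (source, target) pairs, bind the pair's
# source (or target) node to the start variable; the hop candidates fixed
# by earlier deep_narrow_path_query calls arrive via _valueblocks and are
# merged into the value blocks of the generated query below.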
valueblocks = {
startvar: {
(startvar,):
[(tup[0],) for tup in values_chunk] if startvar == SOURCE_VAR
else [(tup[1],) for tup in values_chunk]
}
}
valueblocks.update(_valueblocks)
return gp.to_sparql_deep_narrow_path_query(
var_to_fix,
var_to_count,
valueblocks,
steps,
startvar,
avglimit=avglimit,
gp_in=gp_in
)


# noinspection PyUnusedLocal
def _deep_narrow_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds):
var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \
= _vars_steps_and_stuff
chunk_res = []
res_rows_path = ['results', 'bindings']
bindings = sparql_json_result_bindings_to_rdflib(
get_path(q_res, res_rows_path, default=[])
)
for row in bindings:
# TODO: Maybe return the avg-degree too
chunk_res.append(get_path(row, [var_to_fix]))
return chunk_res


def _deep_narrow_path_res_update(res, update, **_):
res += update


def deep_narrow_path_inst_query(
sparql,
timeout,
graph_pattern,
hop,
valueblocks,
steps,
gp_in=False,
batch_size=None
):
_query_stats.useful_path_inst_query_count += 1
# TODO: maybe batch_size = batch_size - 10 * number of valueblocks for hops
_values = graph_pattern.matching_node_pairs
# TODO: maybe only use STPs that are not yet well covered
_ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
_vars_steps_and_stuff = (hop, valueblocks, steps, gp_in)
return _multi_query(
sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs,
batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping,
_deep_narrow_path_inst_res_init, _deep_narrow_path_inst_chunk_q,
_deep_narrow_path_inst_chunk_result_ext,
_deep_narrow_path_inst_res_update
)


# noinspection PyUnusedLocal
def _deep_narrow_path_inst_res_init(_, **kwds):
return []


def _deep_narrow_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk):
hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff
valueblocks = {
'st': {
(SOURCE_VAR, TARGET_VAR): values_chunk
}
}
valueblocks.update(_valueblocks)
return gp.to_sparql_deep_narrow_path_inst_query(
hop, valueblocks, steps, gp_in=gp_in
)


# noinspection PyUnusedLocal
def _deep_narrow_path_inst_chunk_result_ext(
q_res, _vars_steps_and_stuff, _, **kwds
):
hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff
chunk_res = []
res_rows_path = ['results', 'bindings']
bindings = sparql_json_result_bindings_to_rdflib(
get_path(q_res, res_rows_path, default=[])
)

for row in bindings:
chunk_res.append([get_path(row, [h]) for h in hop])
return chunk_res


def _deep_narrow_path_inst_res_update(res, update, **_):
res += update


def generate_stps_from_gp(sparql, gp):
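For orientation, a sketch of how the learner calls the new query helper for the first source-side hop, mirroring the call in mutate_deep_narrow_path; the wrapper function find_first_hop_candidates is made up here, and the import locations are assumed to match the repository layout:

from rdflib import Variable

from gp_query import deep_narrow_path_query
from graph_pattern import GraphPattern
from graph_pattern import SOURCE_VAR


def find_first_hop_candidates(sparql, timeout, child):
    # sparql: initialised SPARQL endpoint wrapper, timeout: query timeout,
    # child: an already evaluated GraphPattern with matching_node_pairs
    hop0, node0 = Variable('p0'), Variable('n0')
    first_hop = GraphPattern([(SOURCE_VAR, hop0, node0)])
    _t, q_res = deep_narrow_path_query(
        sparql, timeout, child,
        hop0,          # variable to fix in this step
        node0,         # variable counted against the avg. degree limit
        {},            # no hops fixed yet, so no extra value blocks
        [first_hop],   # steps: hop patterns of the partial path so far
        SOURCE_VAR,    # start the search from the source side
    )
    return q_res  # candidate properties for ?p0, possibly empty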