Skip to content

Commit

Permalink
adds more entities to SqlModel approach
Browse files Browse the repository at this point in the history
  • Loading branch information
WolfgangFahl committed Mar 16, 2024
1 parent 427074b commit fc47f09
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 26 deletions.
73 changes: 53 additions & 20 deletions ceurws/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
@author: wf
"""
from lodstorage.query import QueryManager
from sqlmodel import Session, create_engine
from sqlmodel import Session, create_engine, select
from ngwidgets.profiler import Profiler

class SqlDB:
Expand All @@ -24,35 +24,68 @@ def get_session(self):
class Cached:
"""
Manage cached entities.
"""

def __init__(self,clazz,sparql,sql_db,query_name: str,debug:bool=False):
def __init__(self, clazz, sparql, sql_db, query_name: str, debug:bool=False):
"""
Initializes the Manager with the given endpoint, cache name, and query name.
Args:
clazz: the type of the entities to manage
sparql: The SPARQL endpoint for queries.
query_name (str): The name of the query to execute.
"""
self.clazz=clazz
self.sparql=sparql
self.sql_db=sql_db
"""
self.clazz = clazz
self.sparql = sparql
self.sql_db = sql_db
self.query_name = query_name
self.debug=debug
self.debug = debug
# Ensure the table for the class exists
clazz.metadata.create_all(self.sql_db.engine)

def get_lod(self,qm:QueryManager):
query=qm.queriesByName[self.query_name]
self.lod=self.sparql.queryAsListOfDicts(query.query)
def fetch_or_query(self, qm: QueryManager):
    """
    Populate this cache, preferring the local SQL database.

    When the local cache already holds data it is loaded from there;
    otherwise the SPARQL query is run and its results are stored locally.

    Args:
        qm (QueryManager): the query manager passed on to get_lod()
    """
    if not self.check_local_cache():
        # Nothing cached yet: query the endpoint, then persist the result.
        self.get_lod(qm)
        self.store()
        return
    self.fetch_from_local()

def check_local_cache(self) -> bool:
    """
    Report whether the local SQL database holds at least one row
    for the managed class.

    Returns:
        bool: True if a row was found, False otherwise.
    """
    statement = select(self.clazz)
    with self.sql_db.get_session() as session:
        first_row = session.exec(statement).first()
    return first_row is not None

def fetch_from_local(self):
    """
    Fetches data from the local SQL database.

    Sets `self.entities` to all stored instances of the managed class and
    `self.lod` to the corresponding list of dicts.
    """
    profiler = Profiler(f"fetch {self.query_name} from local", profile=self.debug)
    with self.sql_db.get_session() as session:
        self.entities = session.exec(select(self.clazz)).all()
        # model_dump() replaces the dict() method that pydantic v2 deprecates.
        self.lod = [entity.model_dump() for entity in self.entities]
    if self.debug:
        print(f"Loaded {len(self.entities)} records from local cache")
    profiler.time()

def get_lod(self, qm: QueryManager):
    """
    Fetches data using the SPARQL query.

    Looks up the query registered under `self.query_name`, runs it against
    the SPARQL endpoint and keeps the result in `self.lod`.

    Args:
        qm (QueryManager): the query manager holding the named queries

    Raises:
        KeyError: if `self.query_name` is not in `qm.queriesByName`
    """
    query = qm.queriesByName[self.query_name]
    self.lod = self.sparql.queryAsListOfDicts(query.query)
    if self.debug:
        print(f"Found {len(self.lod)} records for {self.query_name}")

def store(self):
    """
    Stores the fetched data into the local SQL database.

    Converts every record of `self.lod` into an instance of the managed
    class, adds all instances in a single session and commits once.
    """
    profiler = Profiler(f"store {self.query_name}", profile=self.debug)
    # model_validate() replaces the parse_obj() method that pydantic v2 deprecates.
    self.entities = [self.clazz.model_validate(record) for record in self.lod]
    with self.sql_db.get_session() as session:
        session.add_all(self.entities)
        session.commit()
    # NOTE(review): unless get_session() disables expire_on_commit, the
    # instances in self.entities are expired and detached from here on;
    # confirm before reading their attributes after store().
    if self.debug:
        print(f"Stored {len(self.entities)} records in local cache")
    profiler.time()
19 changes: 19 additions & 0 deletions ceurws/models/dblp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,23 @@ class Paper(SQLModel, table=True):
volume_number: str = Field(index=True)
title: str
pdf_url: Optional[str]

class Scholar(SQLModel, table=True):
    """
    A scholar record built from DBLP data, optionally enriched with
    identifiers from other sources.
    """

    # DBLP author id; primary key of the table
    dblp_author_id: str = Field(primary_key=True)
    # optional attributes, None when not available
    label: Optional[str] = Field(default=None)
    wikidata_id: Optional[str] = Field(default=None)
    orcid_id: Optional[str] = Field(default=None)
    gnd_id: Optional[str] = Field(default=None)

class Proceeding(SQLModel, table=True):
    """
    A proceeding indexed in DBLP with additional details.
    """

    # Explicit None default: under pydantic v2 an Optional annotation
    # without a default is still a required field, which would reject
    # records that lack a DBLP publication id.
    dblp_publication_id: Optional[str] = None
    # volume number; primary key of the table
    volume_number: int = Field(primary_key=True)
    title: str
    dblp_event_id: Optional[str] = None

38 changes: 38 additions & 0 deletions ceurws/resources/queries/dblp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,41 @@
OPTIONAL{?paper dblp:documentPage ?_pdf_url}
}
GROUP BY ?proceeding ?volume_number ?paper
'CEUR-WS-Scholars':
sparql: |
PREFIX datacite: <http://purl.org/spar/datacite/>
PREFIX dblp: <https://dblp.org/rdf/schema#>
PREFIX litre: <http://purl.org/spar/literal/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT
?dblp_author_id
(SAMPLE(?_label) as ?label)
(SAMPLE(?_wikidata_id) as ?wikidata_id)
(SAMPLE(?_orcid_id) as ?orcid_id)
(SAMPLE(?_gnd_id) as ?gnd_id)
WHERE {
?proceeding dblp:publishedIn "CEUR Workshop Proceedings".
{
?proceeding dblp:editedBy ?dblp_author_id.
} UNION {
?paper dblp:publishedAsPartOf ?proceeding.
?paper dblp:authoredBy ?dblp_author_id.
}
OPTIONAL { ?dblp_author_id rdfs:label ?_label }
OPTIONAL {
?dblp_author_id datacite:hasIdentifier ?wd_blank.
?wd_blank datacite:usesIdentifierScheme datacite:wikidata.
?wd_blank litre:hasLiteralValue ?_wikidata_id.
}
OPTIONAL {
?dblp_author_id datacite:hasIdentifier ?orcid_blank.
?orcid_blank datacite:usesIdentifierScheme datacite:orcid.
?orcid_blank litre:hasLiteralValue ?_orcid_id.
}
OPTIONAL {
?dblp_author_id datacite:hasIdentifier ?gnd_blank.
?gnd_blank datacite:usesIdentifierScheme datacite:gnd.
?gnd_blank litre:hasLiteralValue ?_gnd_id.
}
}
GROUP BY ?dblp_author_id
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ dependencies = [
# https://github.com/WolfgangFahl/nicegui_widgets
'ngwidgets>=0.12.5',
# https://pypi.org/project/wdgrid/
'wdgrid>=0.1.0'
'wdgrid>=0.1.0',
# https://pypi.org/project/sqlmodel/
'sqlmodel>=0.0.16'
]

requires-python = ">=3.9"
Expand Down
17 changes: 12 additions & 5 deletions tests/test_dblp2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from lodstorage.sparql import SPARQL
import os
from lodstorage.query import QueryManager
from ceurws.models.dblp2 import Paper
from ceurws.models.dblp2 import Paper,Scholar, Proceeding
from ceurws.cache import Cached, SqlDB

class TestDblpCache(Basetest):
Expand All @@ -23,12 +23,19 @@ def setUp(self, debug=True, profile=True):
qYamlFile = f"{path}/ceurws/resources/queries/dblp.yaml"
if os.path.isfile(qYamlFile):
self.qm = QueryManager(lang="sparql", queriesPath=qYamlFile)
self.sql_db=SqlDB("/tmp/ceurws.db",debug=self.debug)
self.sql_db=SqlDB("/tmp/ceurws.db",debug=False)

def test_dblp_caches(self):
    """
    test the dblp caches

    Builds one cache per entity type and fills each of them, either from
    the local SQL database or via its SPARQL query.
    """
    caches = [
        Cached(
            Proceeding,
            self.sparql,
            sql_db=self.sql_db,
            query_name="CEUR-WS all Volumes",
            debug=self.debug,
        ),
        Cached(
            Scholar,
            self.sparql,
            sql_db=self.sql_db,
            query_name="CEUR-WS-Scholars",
            debug=self.debug,
        ),
        Cached(
            Paper,
            self.sparql,
            sql_db=self.sql_db,
            query_name="CEUR-WS-Papers",
            debug=self.debug,
        ),
    ]
    for cache in caches:
        cache.fetch_or_query(self.qm)

0 comments on commit fc47f09

Please sign in to comment.