forked from Haross/DTIM_projects_web
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnextiajd.html
executable file
·335 lines (299 loc) · 17.3 KB
/
nextiajd.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
  <meta name="description" content="NextiaJD: scalable data discovery over data lakes using dataset profiles. Companion website with publications, resources, reproducibility instructions and demos.">
  <meta name="author" content="Javier Flores, Sergi Nadal, Raquel Panadero, Oscar Romero">
  <title>NextiaJD</title>
  <!-- Stylesheets: project stylesheet first, then Bootstrap 4.6.0 (pinned with SRI); public/styles.css is loaded last, further below. -->
  <link rel="stylesheet" href="public/dtim.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.6.0/dist/css/bootstrap.min.css" integrity="sha384-B0vP5xmATw1+K9KRQjQERJvTumQW0nPEzvF6L/Z6nronJ3oUOFUFpCjEUQouq2+l" crossorigin="anonymous">
  <!-- jQuery and the Bootstrap bundle are loaded synchronously (no defer/async) on purpose:
       the inline script at the end of the body calls $ while the page is still being parsed. -->
  <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js" integrity="sha384-DfXdz2htPH0lsSSs5nCTpuj/zy4C+OGpamoFVy38MVBnE+IbbVYUew+OrCXaRkfj" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/bootstrap@4.6.0/dist/js/bootstrap.bundle.min.js" integrity="sha384-Piv4xVNRyMGpqkS2by6br4gNJ7DXjqk09RmUpJ8jgGtD7zP9yug3goQfGII0yAns" crossorigin="anonymous"></script>
  <!-- Font Awesome kit: supplies the fa-* icons used in the publications list. -->
  <script src="https://kit.fontawesome.com/92dab46df1.js" crossorigin="anonymous"></script>
  <link rel="stylesheet" href="public/styles.css">
  <!--[if lt IE 9]>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv.js"></script>
  <![endif]-->
</head>
<body>
<div class="container">
<!-- Primary in-page navigation. Collapses behind the toggler button below the lg breakpoint. -->
<nav class="navbar sticky-top navbar-expand-lg navbar-light bg-light">
  <a class="navbar-brand" href="#"><img src="public/nextia_logo.png" height="30" alt="NextiaJD"></a>
  <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
    <span class="navbar-toggler-icon"></span>
  </button>
  <!-- The id must match the toggler's data-target / aria-controls; without it the menu cannot be opened on small screens. -->
  <div class="collapse navbar-collapse" id="navbarSupportedContent">
    <ul class="navbar-nav mr-auto">
      <li class="nav-item">
        <a class="nav-link" href="#people">People</a>
      </li>
      <li class="nav-item">
        <a class="nav-link" href="#publications">Publications</a>
      </li>
      <li class="nav-item">
        <a class="nav-link" href="#resources">Resources</a>
      </li>
      <li class="nav-item">
        <a class="nav-link" href="#reproducibility">Reproducibility</a>
      </li>
      <li class="nav-item">
        <a class="nav-link" href="#demonstration">Demonstration</a>
      </li>
      <!-- <li class="nav-item">
        <a class="nav-link" href="#acknowledgements">Acknowledgements</a>
      </li>
      -->
    </ul>
  </div>
</nav>
<br>
<!-- Page title and project summary. The logo stands in for the project name, so its alt text carries the name. -->
<div class="jumbotron" style="padding: 1.5em 1.5em; font-size: larger">
  <h1><img src="public/nextia_logo.png" height="60" alt="NextiaJD">: Scalable Data Discovery Using Profiles</h1>
  <p class="text-justify">
    Nextia<sub>JD</sub> is a system that supports data discovery over data lakes (i.e., large scale heterogeneous data repositories). This website is a companion of the research papers revolving around this project. Nextia<sub>JD</sub>'s novelty lies in a learning-based approach to data discovery relying on dataset profiles. These are succinct representations that capture the underlying characteristics of the schemata and data values of datasets, which can be efficiently extracted in a parallel and distributed fashion. Profiles are then compared, to predict the quality of a join operation among a pair of attributes from different datasets.
  </p>
</div>
<!-- Team members. Each card is one link; the photo is decorative (alt="") because the name is given in the card title. -->
<div class="bs-docs-section" id="people">
  <h1>People</h1>
  <div class="members-group">
    <ul class="list-unstyled members-list">
      <li class="member-item">
        <a href="#">
          <div class="member member--card">
            <div class="member-section member__photo">
              <img src="https://res.cloudinary.com/dfg89e6oo/image/upload/w_1000,c_fill,ar_1:1,g_auto,r_max,b_rgb:262c35/v1555955606/afrry63rxmujltm2trk4.png" width="180" height="180" alt="">
            </div>
            <div class="member-section member__info">
              <h5 style="text-align:center" class="member__title">Javier Flores</h5>
            </div>
          </div>
        </a>
      </li>
      <li class="member-item">
        <a target="_blank" rel="noopener" href="http://www.essi.upc.edu/~snadal">
          <div class="member member--card">
            <div class="member-section member__photo">
              <img src="https://res.cloudinary.com/dfg89e6oo/image/upload/c_thumb,f_auto,g_faces,h_360,w_360/v1439218702/ffksjjujfa7gab5lbcxb.jpg" width="180" height="180" alt="">
            </div>
            <div class="member-section member__info">
              <h5 style="text-align:center" class="member__title">Sergi Nadal</h5>
            </div>
          </div>
        </a>
      </li>
      <li class="member-item">
        <!-- Placeholder link (no personal page yet): no target="_blank", which would only open a duplicate tab of this page. -->
        <a href="#">
          <div class="member member--card">
            <div class="member-section member__photo">
              <!-- Served over https like the other photos to avoid mixed-content blocking. -->
              <img src="https://res.cloudinary.com/dfg89e6oo/image/upload/c_thumb,f_auto,g_faces/v1599038630/rcy7y8oxdlh2e1gv08ju.jpg" width="180" height="180" alt="">
            </div>
            <div class="member-section member__info">
              <h5 style="text-align:center" class="member__title">Raquel Panadero</h5>
            </div>
          </div>
        </a>
      </li>
      <li class="member-item">
        <a target="_blank" rel="noopener" href="http://www.essi.upc.edu/~oromero">
          <div class="member member--card">
            <div class="member-section member__photo">
              <img src="https://res.cloudinary.com/dfg89e6oo/image/upload/c_thumb,f_auto,g_faces,h_360,w_360/v1445439382/gn7i1ne5vcfelalfdadh.jpg" width="180" height="180" alt="">
            </div>
            <div class="member-section member__info">
              <h5 style="text-align:center" class="member__title">Oscar Romero</h5>
            </div>
          </div>
        </a>
      </li>
    </ul>
  </div>
</div>
<hr>
<!-- Publications by year. Each entry has icon-only links (PDF, video, repository record);
     the aria-label gives each link an accessible name and the icons themselves are hidden from assistive technology.
     Video links open the shared #videoModal and pass the embed URL through data-video. -->
<div class="bs-docs-section" id="publications">
  <h1>Publications</h1>
  <br>
  <h3>2021</h3>
  <br>
  <ul class="list-group">
    <li class="list-group-item d-flex justify-content-between align-items-center">
      <div>
        Towards Scalable Data Discovery <span class="badge badge-primary">Short paper published in EDBT 2021</span>
      </div>
      <div>
        <a href="https://openproceedings.org/2021/conf/edbt/p61.pdf" aria-label="Towards Scalable Data Discovery: paper (PDF)"><i class="far fa-file-pdf fa-2x" aria-hidden="true"></i></a>
        <a href="#" role="button" data-toggle="modal" data-target="#videoModal" data-video="https://www.youtube.com/embed/u0dqqkH14JA" aria-label="Towards Scalable Data Discovery: watch video"><i class="fab fa-youtube fa-2x" aria-hidden="true"></i></a>
        <a href="https://upcommons.upc.edu/handle/2117/343141" aria-label="Towards Scalable Data Discovery: record in UPCommons"><i class="fas fa-info-circle fa-2x" aria-hidden="true"></i></a>
      </div>
    </li>
    <li class="list-group-item d-flex justify-content-between align-items-center">
      <div>
        Effective and scalable data discovery with NextiaJD <span class="badge badge-primary">Demo paper published in EDBT 2021</span>
      </div>
      <div>
        <a href="https://openproceedings.org/2021/conf/edbt/p184.pdf" aria-label="Effective and scalable data discovery with NextiaJD: paper (PDF)"><i class="far fa-file-pdf fa-2x" aria-hidden="true"></i></a>
        <a href="#" role="button" data-toggle="modal" data-target="#videoModal" data-video="https://www.youtube.com/embed/tg5htQcZi1M" aria-label="Effective and scalable data discovery with NextiaJD: watch video"><i class="fab fa-youtube fa-2x" aria-hidden="true"></i></a>
        <a href="https://upcommons.upc.edu/handle/2117/343152" aria-label="Effective and scalable data discovery with NextiaJD: record in UPCommons"><i class="fas fa-info-circle fa-2x" aria-hidden="true"></i></a>
      </div>
    </li>
  </ul>
  <br>
  <h3>2020</h3>
  <br>
  <ul class="list-group">
    <li class="list-group-item d-flex justify-content-between align-items-center">
      <div>
        An integration data tool for joinable tables based on Apache Spark <span class="badge badge-primary">Master thesis</span>
      </div>
      <div>
        <a href="https://upcommons.upc.edu/bitstream/handle/2117/335717/152734.pdf?sequence=1&amp;isAllowed=y" aria-label="An integration data tool for joinable tables based on Apache Spark: thesis (PDF)"><i class="far fa-file-pdf fa-2x" aria-hidden="true"></i></a>
        <a href="https://upcommons.upc.edu/handle/2117/335717" aria-label="An integration data tool for joinable tables based on Apache Spark: record in UPCommons"><i class="fas fa-info-circle fa-2x" aria-hidden="true"></i></a>
      </div>
    </li>
  </ul>
</div>
<hr>
<!-- Resources: source repository, build-tool coordinates and the datasets used for training/evaluation. -->
<div class="bs-docs-section" id="resources">
  <h1>Resources</h1>
  <h3>Software repository</h3>
  <p class="text-justify">
    The source code of the system can be found in the following <a target="_blank" rel="noopener" href="https://github.com/dtim-upc/NextiaJD">Github repository</a>.
  </p>
  <p class="text-justify">The easy way to use NextiaJD is with Maven. For SBT just add the following dependency in your build.sbt</p>
  <div class="alert alert-info" role="alert">
    libraryDependencies += "edu.upc.essi.dtim.nextiajd" % "nextiajd_2.12" % "1.0.1"
  </div>
  <p>For more ways to add NextiaJD using Maven, please see the <a href="https://search.maven.org/artifact/edu.upc.essi.dtim.nextiajd/nextiajd_2.12/1.0.1/jar">artifact page on Maven Central</a>.</p>
  <p>You can check how to use NextiaJD in the <a href="https://github.com/dtim-upc/NextiaJD#usage">usage section of the repository README</a>, or follow the step-by-step Zeppelin notebook in the <a href="#demonstration">demonstration section</a>.</p>
  <!-- <p class="text-justify">
  We additionally provide the compiled JARs for Apache Spark 3.0.1 and Scala 2.12. This files must be placed (and replace if necessary) in Spark's libraries directory of the driver and all workers.
  <ul>
  <li><a href="https://mydisk.cs.upc.edu/s/7wKRxp3DJTgQ7yb/download" target="_blank">Spark-NextiaJD</a></li>
  <li><a href="https://mydisk.cs.upc.edu/s/B36NjoYC6LTP5GQ/download" target="_blank">SparkSQL</a></li>
  <li><a href="https://mydisk.cs.upc.edu/s/j6KfLkgqxtprDod/download" target="_blank">Catalyst</a></li>
  </ul>
  </p> -->
  <h3>Ground truth datasets</h3>
  <!-- The paragraph is closed before the list: a <ul> cannot be nested inside a <p>. -->
  <p class="text-justify">
    Datasets used in this work have been obtained from open data repositories with no copyright such as <a href="https://www.kaggle.com/" target="_blank" rel="noopener">Kaggle</a> and <a href="https://www.openml.org/" target="_blank" rel="noopener">OpenML</a>. The datasets used to generate both our ground truth and to evaluate our method are available in the following links:
  </p>
  <ul>
    <li><a href="https://mydisk.cs.upc.edu/s/aNbnSiSfg5xan6W/download" target="_blank" rel="noopener">Training dataset</a></li>
    <li><a href="https://mydisk.cs.upc.edu/s/eCmfrNEBSKkcWcn/download" target="_blank" rel="noopener">Testbed XS</a> (datasets with file size 0-1MB)</li>
    <li><a href="https://mydisk.cs.upc.edu/s/dX3FajwWZn7rrrd/download" target="_blank" rel="noopener">Testbed S</a> (datasets with file size 1-100MB)</li>
    <li><a href="https://mydisk.cs.upc.edu/s/niPyR4WTtxydprj/download" target="_blank" rel="noopener">Testbed M</a> (datasets with file size 100MB-1GB)</li>
    <li><a href="https://mydisk.cs.upc.edu/s/4qoi76ziT2wJaCR/download" target="_blank" rel="noopener">Testbed L</a> (datasets with file size &gt;1GB)</li>
  </ul>
  <!--
  <br><br>
  <h3>Online version</h3>
  An online version of ODIN is available as a service via the <a href="#">following link</a>. You can interact with the demo (see next section), using the credentials (username: demo, password: demo).
  -->
</div>
<hr>
<!-- Reproducibility: links to the instructions and artifacts for each experiment.
     The list is a sibling of the paragraph (a <ul> cannot be nested inside a <p>) and every <li>/<ul> is explicitly closed. -->
<div class="bs-docs-section" id="reproducibility">
  <h1>Reproducibility</h1>
  <p class="text-justify">
    We believe in transparent and shareable research <a target="_blank" rel="noopener" href="https://www.acm.org/publications/reproducibility">[1]</a>, <a target="_blank" rel="noopener" href="http://db-reproducibility.seas.harvard.edu/">[2]</a>. Hence, we provide you with <a href="https://github.com/dtim-upc/NextiaJD/tree/main/experiments" target="_blank" rel="noopener">detailed instructions</a> on how to reproduce the experiments presented in our work:
  </p>
  <ul>
    <li><a target="_blank" rel="noopener" href="https://colab.research.google.com/drive/1kx3Mcv3ULwXv6NG_F8NO_vutm5nm3XU8?usp=sharing">Generalizability of our approach -- GitTables</a></li>
    <li><a target="_blank" rel="noopener" href="https://github.com/dtim-upc/valentine-extended">Comparison with the state-of-the-art (predictive performance) -- Valentine suite</a></li>
    <li><a target="_blank" rel="noopener" href="https://github.com/dtim-upc/NextiaJD/tree/main/experiments#experiment-2-comparison-with-the-state-of-the-art">Comparison with the state-of-the-art (runtime performance)</a></li>
    <li><a target="_blank" rel="noopener" href="https://github.com/dtim-upc/NextiaJD/tree/main/experiments/Scalability#scalability">Scalability</a></li>
  </ul>
</div>
<hr>
<!-- Demonstration: the two ways NextiaJD is distributed (standalone model, Spark extension), live notebooks and a demo video. -->
<div class="bs-docs-section" id="demonstration">
  <h1>Demonstration</h1>
  <br>
  <p>We provide NextiaJD in two modes of functioning: a) as a standalone Pickle ML model that can be integrated into any Python application, and b) as an Apache Spark extension.</p>
  <h3>Standalone Pickle model</h3>
  <p>We provide the learning model that, given a vector of profile distances, provides the predicted join quality for a pair of attributes.</p>
  <p class="text-justify">In the <a href="https://github.com/dtim-upc/NextiaJD/tree/master/NextiaJDService" target="_blank" rel="noopener">following Github repository</a>, we provide an API that wraps NextiaJD's services so they can be used from other programming languages (e.g., Python) invoking the command via terminal. These are required to compute the profiles and their distances.</p>
  <h4>ML Model</h4>
  <p class="text-justify">The model can be downloaded from the <a href="https://mydisk.cs.upc.edu/s/aBHZXg9ef38K34D/download/ML_Best_model.pkl" target="_blank" rel="noopener">following link</a> (see <a href="https://colab.research.google.com/drive/1kx3Mcv3ULwXv6NG_F8NO_vutm5nm3XU8?usp=sharing" target="_blank" rel="noopener">this Colab notebook</a> for more details on how to use it).</p>
  <h3>Apache Spark extension</h3>
  <p> Live demos of NextiaJD are available as Zeppelin notebooks. Bear in mind that, <b>in order to access them you must first login</b> with the following credentials (user: <b>user1</b>, password: <b>nextiajd</b>).
  </p>
  <h4>Interactive GUI</h4>
  <p class="text-justify">
    A live demo of the user interface is available in <a target="_blank" rel="noopener" href="http://quarry.essi.upc.edu:8081/#/notebook/2G9CMU9C4">this Zeppelin notebook</a>.
  </p>
  <h4>Library usage</h4>
  <p class="text-justify">
    We also provide a <a target="_blank" rel="noopener" href="http://quarry.essi.upc.edu:8081/#/notebook/2FZ5HCMJQ">code-oriented demo</a> showcasing how proficient data analysts can take full benefit of our tool.
  </p>
  <h3>Videos</h3>
  <div class="row justify-content-center">
    <!-- w-100 replaces width="100%": the width attribute only accepts an integer number of pixels. -->
    <video class="w-100" controls>
      <source src="https://mydisk.cs.upc.edu/s/wa4FmPM9NxNWWb7/download/NextiaJD_demonstration.mp4" type="video/mp4">
      <!-- Fallback shown only by browsers that cannot render the video element. -->
      Your browser cannot play this video. <a href="https://mydisk.cs.upc.edu/s/wa4FmPM9NxNWWb7/download/NextiaJD_demonstration.mp4">Download the NextiaJD demonstration video</a> instead.
    </video>
  </div>
</div>
<!-- <div class="bs-docs-section" id="acknowledgements">
<h1>Acknowledgements</h1>
<p>
TBA
</p>
</div>
-->
<!-- Modal: a single shared video player. The iframe has no src in the markup; the inline script at the
     end of the body sets it from the clicked link's data-video attribute and removes it again on close. -->
<div class="modal" id="videoModal" tabindex="-1" role="dialog" aria-label="Video player" aria-hidden="true">
  <div class="modal-dialog modal-dialog-centered modal-lg" role="document">
    <div class="modal-content">
      <!-- <div class="modal-header bg-dark border-dark">
      <button type="button" class="close text-white" data-dismiss="modal">×</button>
      </div> -->
      <div class="modal-body bg-dark p-0">
        <div class="embed-responsive embed-responsive-16by9">
          <iframe class="embed-responsive-item" title="Publication video" allowfullscreen></iframe>
        </div>
      </div>
    </div>
  </div>
</div>
<!-- <div class="modal" tabindex="-1" id="videoModal">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title">Modal title</h5>
<button type="button" class="close" data-dismiss="modal" aria-label="Close">
<span aria-hidden="true">×</span>
</button>
</div>
<div class="modal-body">
<p>Modal body text goes here.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-dismiss="modal">Close</button>
<button type="button" class="btn btn-primary">Save changes</button>
</div>
</div>
</div>
</div> -->
<hr>
<p>Last update: 2023/01/12 by Sergi Nadal</p>
</div>
<script>
/**
 * Wires up the shared video modal (#videoModal).
 *
 * Links that open the modal carry the embed URL in a data-video attribute.
 * When the modal is about to be shown, that URL is copied onto the modal's
 * iframe; once the modal is fully hidden, the iframe attributes are removed
 * again so the embedded page is unloaded (which also stops playback).
 */
$(document).ready(function() {
  const $modal = $("#videoModal");
  const $player = $modal.find("iframe");

  // Set iframe attributes when the show instance method is called.
  $modal.on("show.bs.modal", function(event) {
    // event.relatedTarget is the element that triggered the modal.
    const url = $(event.relatedTarget).data("video");
    if (!url) {
      // Opened without a data-video trigger (e.g. programmatically): nothing to load.
      return;
    }
    $player.attr({
      src: url,
      allow: "accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture"
    });
  });

  // Remove iframe attributes when the modal has finished being hidden from the user.
  $modal.on("hidden.bs.modal", function() {
    $player.removeAttr("src allow");
  });
});
</script>
</body>
</html>