-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathReadme.html
330 lines (267 loc) · 18.8 KB
/
Readme.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
<!DOCTYPE html>
<html>
<head>
<title>Readme</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="file:////home/boris/.atom/packages/markdown-preview-enhanced/node_modules/@shd101wyy/mume/dependencies/katex/katex.min.css">
<style>
/**
* prism.js Github theme based on GitHub's theme.
* @author Sam Clarke
*/
code[class*="language-"],
pre[class*="language-"] {
color: #333;
background: none;
font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
text-align: left;
white-space: pre;
word-spacing: normal;
word-break: normal;
word-wrap: normal;
line-height: 1.4;
-moz-tab-size: 8;
-o-tab-size: 8;
tab-size: 8;
-webkit-hyphens: none;
-moz-hyphens: none;
-ms-hyphens: none;
hyphens: none;
}
/* Code blocks */
pre[class*="language-"] {
padding: .8em;
overflow: auto;
/* border: 1px solid #ddd; */
border-radius: 3px;
/* background: #fff; */
background: #f5f5f5;
}
/* Inline code */
:not(pre) > code[class*="language-"] {
padding: .1em;
border-radius: .3em;
white-space: normal;
background: #f5f5f5;
}
.token.comment,
.token.blockquote {
color: #969896;
}
.token.cdata {
color: #183691;
}
.token.doctype,
.token.punctuation,
.token.variable,
.token.macro.property {
color: #333;
}
.token.operator,
.token.important,
.token.keyword,
.token.rule,
.token.builtin {
color: #a71d5d;
}
.token.string,
.token.url,
.token.regex,
.token.attr-value {
color: #183691;
}
.token.property,
.token.number,
.token.boolean,
.token.entity,
.token.atrule,
.token.constant,
.token.symbol,
.token.command,
.token.code {
color: #0086b3;
}
.token.tag,
.token.selector,
.token.prolog {
color: #63a35c;
}
.token.function,
.token.namespace,
.token.pseudo-element,
.token.class,
.token.class-name,
.token.pseudo-class,
.token.id,
.token.url-reference .token.variable,
.token.attr-name {
color: #795da3;
}
.token.entity {
cursor: help;
}
.token.title,
.token.title .token.punctuation {
font-weight: bold;
color: #1d3e81;
}
.token.list {
color: #ed6a43;
}
.token.inserted {
background-color: #eaffea;
color: #55a532;
}
.token.deleted {
background-color: #ffecec;
color: #bd2c00;
}
.token.bold {
font-weight: bold;
}
.token.italic {
font-style: italic;
}
/* JSON */
.language-json .token.property {
color: #183691;
}
.language-markup .token.tag .token.punctuation {
color: #333;
}
/* CSS */
code.language-css,
.language-css .token.function {
color: #0086b3;
}
/* YAML */
.language-yaml .token.atrule {
color: #63a35c;
}
code.language-yaml {
color: #183691;
}
/* Ruby */
.language-ruby .token.function {
color: #333;
}
/* Markdown */
.language-markdown .token.url {
color: #795da3;
}
/* Makefile */
.language-makefile .token.symbol {
color: #795da3;
}
.language-makefile .token.variable {
color: #183691;
}
.language-makefile .token.builtin {
color: #0086b3;
}
/* Bash */
.language-bash .token.keyword {
color: #0086b3;
}html body{font-family:"Helvetica Neue",Helvetica,"Segoe UI",Arial,freesans,sans-serif;font-size:16px;line-height:1.6;color:#333;background-color:#fff;overflow:initial;box-sizing:border-box;word-wrap:break-word}html body>:first-child{margin-top:0}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{line-height:1.2;margin-top:1em;margin-bottom:16px;color:#000}html body h1{font-size:2.25em;font-weight:300;padding-bottom:.3em}html body h2{font-size:1.75em;font-weight:400;padding-bottom:.3em}html body h3{font-size:1.5em;font-weight:500}html body h4{font-size:1.25em;font-weight:600}html body h5{font-size:1.1em;font-weight:600}html body h6{font-size:1em;font-weight:600}html body h1,html body h2,html body h3,html body h4,html body h5{font-weight:600}html body h5{font-size:1em}html body h6{color:#5c5c5c}html body strong{color:#000}html body del{color:#5c5c5c}html body a:not([href]){color:inherit;text-decoration:none}html body a{color:#08c;text-decoration:none}html body a:hover{color:#00a3f5;text-decoration:none}html body img{max-width:100%}html body>p{margin-top:0;margin-bottom:16px;word-wrap:break-word}html body>ul,html body>ol{margin-bottom:16px}html body ul,html body ol{padding-left:2em}html body ul.no-list,html body ol.no-list{padding:0;list-style-type:none}html body ul ul,html body ul ol,html body ol ol,html body ol ul{margin-top:0;margin-bottom:0}html body li{margin-bottom:0}html body li.task-list-item{list-style:none}html body li>p{margin-top:0;margin-bottom:0}html body .task-list-item-checkbox{margin:0 .2em .25em -1.8em;vertical-align:middle}html body .task-list-item-checkbox:hover{cursor:pointer}html body blockquote{margin:16px 0;font-size:inherit;padding:0 15px;color:#5c5c5c;border-left:4px solid #d6d6d6}html body blockquote>:first-child{margin-top:0}html body blockquote>:last-child{margin-bottom:0}html body hr{height:4px;margin:32px 0;background-color:#d6d6d6;border:0 none}html body table{margin:10px 0 15px 0;border-collapse:collapse;border-spacing:0;display:block;width:100%;overflow:auto;word-break:normal;word-break:keep-all}html body table th{font-weight:bold;color:#000}html body table td,html body table th{border:1px solid #d6d6d6;padding:6px 13px}html body dl{padding:0}html body dl dt{padding:0;margin-top:16px;font-size:1em;font-style:italic;font-weight:bold}html body dl dd{padding:0 16px;margin-bottom:16px}html body code{font-family:Menlo,Monaco,Consolas,'Courier New',monospace;font-size:.85em !important;color:#000;background-color:#f0f0f0;border-radius:3px;padding:.2em 0}html body code::before,html body code::after{letter-spacing:-0.2em;content:"\00a0"}html body pre>code{padding:0;margin:0;font-size:.85em !important;word-break:normal;white-space:pre;background:transparent;border:0}html body .highlight{margin-bottom:16px}html body .highlight pre,html body pre{padding:1em;overflow:auto;font-size:.85em !important;line-height:1.45;border:#d6d6d6;border-radius:3px}html body .highlight pre{margin-bottom:0;word-break:normal}html body pre code,html body pre tt{display:inline;max-width:initial;padding:0;margin:0;overflow:initial;line-height:inherit;word-wrap:normal;background-color:transparent;border:0}html body pre code:before,html body pre tt:before,html body pre code:after,html body pre tt:after{content:normal}html body p,html body blockquote,html body ul,html body ol,html body dl,html body pre{margin-top:0;margin-bottom:16px}html body kbd{color:#000;border:1px solid #d6d6d6;border-bottom:2px solid #c7c7c7;padding:2px 4px;background-color:#f0f0f0;border-radius:3px}@media print{html body{background-color:#fff}html body h1,html body h2,html body h3,html body h4,html body h5,html body h6{color:#000;page-break-after:avoid}html body blockquote{color:#5c5c5c}html body pre{page-break-inside:avoid}html body table{display:table}html body img{display:block;max-width:100%;max-height:100%}html body pre,html body code{word-wrap:break-word;white-space:pre}}.markdown-preview{width:100%;height:100%;box-sizing:border-box}.markdown-preview .pagebreak,.markdown-preview .newpage{page-break-before:always}.markdown-preview pre.line-numbers{position:relative;padding-left:3.8em;counter-reset:linenumber}.markdown-preview pre.line-numbers>code{position:relative}.markdown-preview pre.line-numbers .line-numbers-rows{position:absolute;pointer-events:none;top:1em;font-size:100%;left:0;width:3em;letter-spacing:-1px;border-right:1px solid #999;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.markdown-preview pre.line-numbers .line-numbers-rows>span{pointer-events:none;display:block;counter-increment:linenumber}.markdown-preview pre.line-numbers .line-numbers-rows>span:before{content:counter(linenumber);color:#999;display:block;padding-right:.8em;text-align:right}.markdown-preview .mathjax-exps .MathJax_Display{text-align:center !important}.markdown-preview:not([for="preview"]) .code-chunk .btn-group{display:none}.markdown-preview:not([for="preview"]) .code-chunk .status{display:none}.markdown-preview:not([for="preview"]) .code-chunk .output-div{margin-bottom:16px}.scrollbar-style::-webkit-scrollbar{width:8px}.scrollbar-style::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}.scrollbar-style::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode]){position:relative;width:100%;height:100%;top:0;left:0;margin:0;padding:0;overflow:auto}html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{position:relative;top:0}@media screen and (min-width:914px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{padding:2em calc(50% - 457px)}}@media screen and (max-width:914px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode]) .markdown-preview{font-size:14px !important;padding:1em}}@media print{html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{display:none}}html body[for="html-export"]:not([data-presentation-mode]) #sidebar-toc-btn{position:fixed;bottom:8px;left:8px;font-size:28px;cursor:pointer;color:inherit;z-index:99;width:32px;text-align:center;opacity:.4}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] #sidebar-toc-btn{opacity:1}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc{position:fixed;top:0;left:0;width:300px;height:100%;padding:32px 0 48px 0;font-size:14px;box-shadow:0 0 4px rgba(150,150,150,0.33);box-sizing:border-box;overflow:auto;background-color:inherit}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar{width:8px}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-track{border-radius:10px;background-color:transparent}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc::-webkit-scrollbar-thumb{border-radius:5px;background-color:rgba(150,150,150,0.66);border:4px solid rgba(150,150,150,0.66);background-clip:content-box}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc a{text-decoration:none}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{padding:0 1.6em;margin-top:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc li{margin-bottom:.8em}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .md-sidebar-toc ul{list-style-type:none}html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{left:300px;width:calc(100% - 300px);padding:2em calc(50% - 457px - 150px);margin:0;box-sizing:border-box}@media screen and (max-width:1274px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{padding:2em}}@media screen and (max-width:450px){html body[for="html-export"]:not([data-presentation-mode])[html-show-sidebar-toc] .markdown-preview{width:100%}}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .markdown-preview{left:50%;transform:translateX(-50%)}html body[for="html-export"]:not([data-presentation-mode]):not([html-show-sidebar-toc]) .md-sidebar-toc{display:none}
/* Please visit the URL below for more information: */
/* https://shd101wyy.github.io/markdown-preview-enhanced/#/customize-css */
</style>
</head>
<body for="html-export">
<div class="mume markdown-preview ">
<p>The published web site for this repository is at <a href="https://borisbaldassari.github.io/data-anonymiser/">https://borisbaldassari.github.io/data-anonymiser/</a>.</p>
<blockquote>
<p><strong>Announce: Reviews are needed! Help us make the world a safer place!</strong><br>
We aim to provide several software-related data sets, which include personal data from developers and contributors. We need people to review this anonymisation module to make sure the anonymisation process is safe, both privacy-wise and research-wise (i.e. anonymised data are still useful). Please try these functions, use them, abuse them, and <a href="https://github.com/borisbaldassari/data-anonymiser/issues">contact us</a> if you have any concern or question.</p>
</blockquote>
<h1 class="mume-header" id="anonymise-utilities">Anonymise utilities</h1>
<p>This Perl module provides utilities to anonymise data, in the context of dataset generation. The module provides several functions to anonymise different types of data (e.g. email addresses, names..), either by a one-way function (anonymised data cannot be recovered) or two-ways functions (anonymised data can be recovered using the private key). In all cases the anonymisation mechanism preserves unicity of transformation, which means that several occurrences of the same id will be translated to the same scrambled string. This enables research analysis and studies while preserving privacy within the dataset.</p>
<p>From the technical standpoint the idea basically is to encrypt the data with a private/public key system -- in this case, RSA -- and then throw away the keys. It has the following properties:</p>
<ul>
<li>Retrieving the original data is close to impossible, according to current cryptographic systems. One can still, for debugging purposes, get the original data back if the key is saved. For production or final releases the key should not be saved anywhere to ensure privacy.</li>
<li>It still preserves one-to-one relationship between original data and encrypted data, with as little collisions as possible. This is especially useful for research analysis, where one can detect identical items without knowing the protected data.</li>
</ul>
<p>Note about one-way shortened srambling functions: it is ok to truncate hashes, and encoding base64 makes the collision risk a bit lower. For a good explanation see <a href="https://stackoverflow.com/questions/4567089/hash-function-that-produces-short-hashes">hash function that produces short hashes</a> on StackOverflow.</p>
<h2 class="mume-header" id="module-description">Module description</h2>
<p>Canonical example:</p>
<pre data-role="codeBlock" data-info="" class="language-"><code>use Anonymise::Utilities;
my $utils = Anonymise::Utilities->new();
$utils->create_keys();
# Get a 16 chars scrambled string
my $scrambled = $utils->scramble_string("blablabla");
# Get a full-size binary-encoded string
my $scrambled = $utils->encode_string($str)("blablabla");
# Get a full-size base64-encoded string
my $scrambled = $utils->encode_string_base64("blablabla");
</code></pre><p>The Perl module offers the following functions:</p>
<ul>
<li>
<p><strong>scramble_string($str)</strong> Returns a 16 chars anonymised string. Several runs on the same input string will always return the same scrambled string.</p>
</li>
<li>
<p><strong>scramble_email($str)</strong> Encode an email address with the generated public/private key and returns a set of 2 strings, each truncated to 16 chars. Output is base64-encoded. Note that this function is one-way: it is not possible to retrieve the plain text string from the truncated output.</p>
</li>
<li>
<p><strong>encode_string($str)</strong> Returns a full-size (512 chars) string encoded using the private key. The returned string will most likely contain binary characters. Several runs on the same input string will always return the same string.</p>
</li>
<li>
<p><strong>encode_string_base64($str)</strong> Returns a full-size (692 chars) string encoded using the private key and then encoded in base64. The returned string will not contain any binary characters. Several runs on the same input string will always return the same scrambled string.</p>
</li>
</ul>
<h2 class="mume-header" id="technical-reauirements">Technical reauirements</h2>
<p>The Perl module requires <code>Crypt::PK::RSA</code>.</p>
<h2 class="mume-header" id="functional-requirements">Functional requirements</h2>
<p>In our use case we want to be able to anonymise the following types of data:</p>
<ul>
<li>UUIDs, whatever they are.</li>
<li>Email addresses</li>
<li>Java classes (by class? or by full path?)</li>
</ul>
<p>We want to have a one-to-one relationship from original data to encrypted data, so as to identify identical entries without knowing them.</p>
<p>We want a method that prevent any reconstruction of the original data, according to the current state of research on privacy.</p>
<h2 class="mume-header" id="test-output">Test output</h2>
<p>Perl tests output:</p>
<pre data-role="codeBlock" data-info="" class="language-"><code>boris@kadath:code$ perl -I. t/00_utilities.t
ok 1 - use Anonymise::Utilities;
ok 2 - An object of class 'Anonymise::Utilities' isa 'Anonymise::Utilities'
# Create keys.
ok 3 - Public key starts with MII...
ok 4 - Public key has length 746.
ok 5 - Scrambled output for input 1 char is 16 chars long.
ok 6 - Scrambled output for input 9 char is 16 chars long.
ok 7 - Scrambled output for input 86 char is 16 chars long.
ok 8 - Scrambled output for in 471 char is 16 chars long.
ok 9 - Scrambled output for in 701 char is 16 chars long.
ok 10 - Scrambled output for in 931 char is 16 chars long.
ok 11 - Scrambled output for test case is 16 chars long.
ok 12 - Scrambled output for test case 2 is 16 chars long.
ok 13 - Scrambled output for test case 3 is 16 chars long.
ok 14 - Encoded string for input 9 chars is 512 chars long.
ok 15 - Encoded string for input 9 chars is 692 chars long.
ok 16 - Decoded string is equal to original string.
1..16
</code></pre><p>Functions output:</p>
<pre data-role="codeBlock" data-info="" class="language-"><code>* Test create_keys
MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA0sHCiS8B1x0qLaeuJfLPE49K2fw5x4cp
CoYpjYFECjuxNSEMUA5RlEBD37nesMK6ftEA3dVmtDtroA3EHG5NrbVqXCFKBAksdaLpDGPaemsA
rMKo+D1rNQem6jF/aBPbUydkqXYtsdNe9LQ2NJGPTbxZByMSW9R/YlPLOUO1xW7arh63zMed2W5u
SJFyBroquRw9vJVRhVYDmNJOzL5o7OClOEPPWgv9Riz4/sCWvgX5vEFuCBT8PsXUNmCe8QUE/wzs
Y+BjnQntyrdsMPPVcRhX6wagdHPgcq8A2lnHnLtwMewfBWT38BiACp5scPHOu+hyB76YJsIMNJtC
25yc/QIDAQAB
* Test scramble_string => 16 chars
XMOx+PZLA15tKvEB
* Test encode_string => 256 chars
[SNIP -- github-flavoured markdown doesn't like weird chars]
* Test encode_string_base64 => 348 chars
bM0ekgWEYIroYPioZ7qZO2iR5L0dfPKfd6pptwHXc+qBqkJd5uIG4BBzFm8I4xVeKXv3Haf/xpx4
3pRVw6FCS8kEWcfevsn8AQX1Pf37eMgLxVP3t0CZ4dPHmNB28AMAJaKr5v5jII+lYrsKqeRjWQ8Z
RFKLfwyxnn9R9TPpaBL0S+QRgNIybp/m6bQ5POBOlvcHer5wtmSGbCTFMXF54ZT90DkpOD/gVGtx
YmLgfEWqKmXjjm+mXMkqPlFKHarJB0A22mA6yKAgrfEi3gjaofbZ6z3SLOR44YKM0RJFHNM5DV/p
qTp2O1pgQI8iHrm+t18vcSzt7lqkhsmBHhpmKQ==
* Test decode_string => 9 chars
BLABLABLA
</code></pre>
</div>
</body>
</html>