-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
21 changed files
with
2,068 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
Detection Rules | ||
--------------- | ||
|
||
Check the filename | ||
|
||
"li[cs]en[cs]e(s?)", | ||
"legal", | ||
"copy(left|right|ing)", | ||
"unlicense", | ||
"l?gpl([-_ v]?)(\\d\\.?\\d)?", | ||
"bsd", | ||
"mit", | ||
"apache", | ||
|
||
If it falls into one of the above, it's highly likely it is a licence, | ||
and should be tested. Note that the name itself strongly indicates the | ||
licence, with unlicense for example indicating it is the Unlicense. | ||
|
||
Something like licence, legal, or copy(left|right|ing) needs to be checked | ||
because while it is highly likely to have a licence we cannot be sure | ||
as to which licence it actually is. It's also possible that these examples | ||
could have multiple licenses in them. Example: github.com/valkey/valkey/COPYING | ||
|
||
"", | ||
".md", | ||
".rst", | ||
".html", | ||
".txt", | ||
|
||
Where the file matches the above patterns, and it has no extension or | ||
one of the extensions listed, we should inspect it to see if it has a license. It's possible | ||
a licence exists here, but we cannot be sure. Note that it's possible there are multiple | ||
licences in the file, which need to be dealt with. | ||
|
||
// SPDX-License-Identifier: MIT OR Unlicense | ||
|
||
For all other files, there are a few possibilities. | ||
The first is that it contains an SPDX header such as the above which indicates | ||
which license the file is under. It's also possible that the header will contain | ||
a full copy of another licence such as MIT, GPL or otherwise, possibly inside a comment | ||
or a long string declaration in the case of code. It's possible it has multiple. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
// SPDX-License-Identifier: AGPL-3.0 | ||
|
||
package pkg | ||
|
||
import ( | ||
"fmt" | ||
"regexp" | ||
"strings" | ||
) | ||
|
||
// commonLicences lists SPDX identifiers that Guess compares a licence
// file's name against (case-insensitively) before falling back to
// content-based matching.
var commonLicences = []string{
	"MIT",
	"Apache-2.0",
	"GPL-3.0",
	"AGPL-3.0",
	"BSD-3-Clause",
	"GPL-2.0",
	"BSD-2-Clause",
	"CC0-1.0",
	"LGPL-3.0",
	"LGPL-2.1",
	"ISC",
	"0BSD",
	"LGPL-2.0",
	"Unlicense",
	"BSD-3-Clause-No-Nuclear-License-2014",
	"MPL-2.0",
	"EPL-1.0",
	"MPL-2.0-no-copyleft-exception",
	"AGPL-1.0",
	"CC-BY-4.0",
	"IPL-1.0",
	"CPL-1.0",
	"CC-BY-3.0",
	"CC-BY-SA-4.0",
	"WTFPL",
	"Zlib",
	"CC-BY-SA-3.0",
	"Cube",
	"JSON",
	"BitTorrent-1.0",
}
|
||
// Lifted from https://github.com/go-enry/go-license-detector/blob/580c5627556917dee649cdb2b179cb42d6c56a60/licensedb/internal/investigation.go#L29 | ||
// SPDX-License-Identifier: Apache-2.0 | ||
var (
	// licenseFileNames holds the regular-expression fragments for the base
	// names of files that can be guessed to be license files.
	licenseFileNames = []string{
		"li[cs]en[cs]e(s?)",
		"legal",
		"copy(left|right|ing)",
		"unlicense",
		"l?gpl([-_ v]?)(\\d\\.?\\d)?",
		"bsd",
		"mit",
		"apache",
	}

	// fileExtensions holds the extensions (including "no extension") that
	// readmeFileRe accepts after the readme/guidelines base name.
	fileExtensions = []string{"", ".md", ".rst", ".html", ".txt"}

	// licenseFileRe matches a lower-cased file name in which one of the
	// licenseFileNames fragments stands alone or is delimited from the
	// rest of the name by '-', '_', '.' or a space.
	licenseFileRe = regexp.MustCompile(fmt.Sprintf(
		"^(|.*[-_. ])(%s)(|[-_. ].*)$",
		strings.Join(licenseFileNames, "|"),
	))

	// readmeFileRe matches "readme" or "guidelines" followed by exactly one
	// of fileExtensions. The dots in the extensions are escaped so they
	// match literally.
	readmeFileRe = regexp.MustCompile(fmt.Sprintf(
		"^(readme|guidelines)(%s)$",
		strings.ReplaceAll(strings.Join(fileExtensions, "|"), ".", "\\."),
	))
)
|
||
func IsLicenceFile(filename string) bool { | ||
// attempt to filter out false positives that come from java due to filenames | ||
if strings.Count(filename, ".") > 2 { | ||
return false | ||
} | ||
|
||
return licenseFileRe.Match([]byte(strings.ToLower(filename))) | ||
} | ||
|
||
func IsReadmeFile(filename string) bool { | ||
return readmeFileRe.Match([]byte(strings.ToLower(filename))) | ||
} | ||
|
||
// compareOptimize reduces input to a compact form for comparison.
//
// The input is split on whitespace. Each distinct token is considered once,
// at its first occurrence, and is dropped when a strictly longer token that
// contains it appears later in the input. The surviving tokens are
// concatenated in order with no separator.
func compareOptimize(input string) string {
	tokens := strings.Fields(input)
	seen := make(map[string]struct{}, len(tokens))
	var sb strings.Builder

	for i, tok := range tokens {
		// Only the first occurrence of a token is examined; repeats are
		// skipped, which matters for performance on long inputs.
		if _, dup := seen[tok]; dup {
			continue
		}
		seen[tok] = struct{}{}

		// Look only at tokens after position i. A strictly longer token can
		// never equal tok, so no separate equality check is needed, and the
		// scan stops at the first containing token.
		subsumed := false
		for _, other := range tokens[i+1:] {
			if len(other) > len(tok) && strings.Contains(other, tok) {
				subsumed = true
				break
			}
		}

		if !subsumed {
			sb.WriteString(tok)
		}
	}

	return sb.String()
}
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
// SPDX-License-Identifier: AGPL-3.0 | ||
|
||
package pkg | ||
|
||
import ( | ||
"encoding/base64" | ||
"encoding/json" | ||
"github.com/boyter/lc/pkg/levenshtein" | ||
"math" | ||
"strings" | ||
) | ||
|
||
type LicenceDetector struct { | ||
Database []License | ||
} | ||
|
||
func NewLicenceDetector() *LicenceDetector { | ||
l := new(LicenceDetector) | ||
l.LoadDatabase() | ||
return l | ||
} | ||
|
||
// License describes one group of licences that are treated as identical.
type License struct {
	// LicenseTexts holds the example texts available for this group.
	LicenseTexts []string `json:"licenseTexts"`
	// LicenseIds holds the SPDX ids of the licences considered identical.
	LicenseIds []string `json:"licenseIds"`
	// Keywords holds unique keywords that can identify this group.
	Keywords []string `json:"keywords"`
}
|
||
// LoadDatabase will initialize the database values and should only be called once such as in an init | ||
func (l *LicenceDetector) LoadDatabase() { | ||
if len(l.Database) != 0 { | ||
return | ||
} | ||
|
||
data, _ := base64.StdEncoding.DecodeString(database_keywords) | ||
_ = json.Unmarshal(data, &l.Database) | ||
} | ||
|
||
// LicenseGuess is a single candidate produced by LicenceDetector.Guess.
type LicenseGuess struct {
	// Name is the identifier of the guessed licence.
	Name string
}
|
||
func (l *LicenceDetector) Guess(filename string, content string) []LicenseGuess { | ||
if IsLicenceFile(filename) { | ||
// Check if the filename matches on of the common licences in which case return that | ||
// since it seems unlikely someone would add a file called LGPL-2.0 without | ||
// it actually being that licence | ||
for _, li := range commonLicences { | ||
if strings.EqualFold(filename, li) { | ||
return []LicenseGuess{ | ||
{ | ||
Name: li, | ||
}, | ||
} | ||
} | ||
} | ||
|
||
// at this point we are confident we have a licence file, but we don't know which one, so lets | ||
// start by firstly assuming there is only 1 license in the file | ||
// and then try to determine what is actually inside the file | ||
var bestGuess License | ||
bestMatch := math.MaxInt | ||
con := []rune(compareOptimize(content)) | ||
for _, li := range l.Database { | ||
for _, lic := range li.LicenseTexts { | ||
m := levenshtein.DistanceForStrings([]rune(compareOptimize(lic)), con, levenshtein.DefaultOptions) | ||
if m < bestMatch { | ||
bestGuess = li | ||
bestMatch = m | ||
} | ||
} | ||
} | ||
|
||
if len(bestGuess.LicenseIds) != 0 { | ||
return []LicenseGuess{ | ||
{ | ||
Name: bestGuess.LicenseIds[0], | ||
}, | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
if IsReadmeFile(filename) { | ||
// at this point we are confident we have a licence file, but we don't know which one, so lets | ||
// start by firstly assuming there is only 1 license in the file | ||
// and then try to determine what is actually inside the file | ||
var bestGuess License | ||
bestMatch := math.MaxInt | ||
con := []rune(compareOptimize(content)) | ||
for _, li := range l.Database { | ||
for _, lic := range li.LicenseTexts { | ||
m := levenshtein.DistanceForStrings([]rune(compareOptimize(lic)), con, levenshtein.DefaultOptions) | ||
if m < bestMatch { | ||
bestGuess = li | ||
bestMatch = m | ||
} | ||
} | ||
} | ||
|
||
if len(bestGuess.LicenseIds) != 0 { | ||
return []LicenseGuess{ | ||
{ | ||
Name: bestGuess.LicenseIds[0], | ||
}, | ||
} | ||
} | ||
} | ||
|
||
return nil | ||
} |
Oops, something went wrong.