Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support reading more compression formats #37

Merged
merged 9 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cmd/extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,14 @@ func extract(cmd *cobra.Command, files []string) {
}(resultsChan)

for {
record, err := reader.ReadRecord()
if err != nil {
if err != io.EOF {
logrus.Errorf("failed to read all record content: %v", err)
return
}
record, eol, err := reader.ReadRecord()
if eol {
break
}
if err != nil {
logrus.Errorf("failed to read all record content: %v", err)
return
}

swg.Add()
go processRecord(cmd, record, &resultsChan, &swg)
Expand Down
3 changes: 2 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"os"
"runtime"

"github.com/spf13/cobra"
)
Expand All @@ -17,7 +18,7 @@ func init() {
extractCmd.Flags().Bool("host-sort", false, "Sort the extracted URLs by host")
extractCmd.Flags().Bool("hash-suffix", false, "When duplicate file names exist, the hash will be added if a duplicate file name exists. ")

verifyCmd.Flags().IntP("threads", "t", 1, "Number of threads to use for verification")
verifyCmd.Flags().IntP("threads", "t", runtime.NumCPU(), "Number of threads to use for verification")
verifyCmd.Flags().Bool("json", false, "Output results in JSON format")
}

Expand Down
166 changes: 127 additions & 39 deletions cmd/verify.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,39 @@ package main

import (
"bufio"
"io"
"net/http"
"os"
"strconv"
"strings"
"sync"
"time"

"github.com/CorentinB/warc"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
)

func processVerifyRecord(record *warc.Record, filepath string, results chan<- result) {
var res result
res.blockDigestErrorsCount, res.blockDigestValid = verifyBlockDigest(record, filepath)
res.payloadDigestErrorsCount, res.payloadDigestValid = verifyPayloadDigest(record, filepath)
res.warcVersionValid = verifyWarcVersion(record, filepath)
results <- res
}

type result struct {
warcVersionValid bool
blockDigestErrorsCount int
blockDigestValid bool
payloadDigestErrorsCount int
payloadDigestValid bool
}

func verify(cmd *cobra.Command, files []string) {
// threads, err := strconv.Atoi(cmd.Flags().Lookup("threads").Value.String())
// if err != nil {
// logrus.Fatalf("failed to parse threads: %s", err.Error())
// }
threads, err := strconv.Atoi(cmd.Flags().Lookup("threads").Value.String())
if err != nil {
logrus.Fatalf("failed to parse threads: %s", err.Error())
}

logger := logrus.New()
if cmd.Flags().Lookup("json").Changed {
Expand All @@ -26,8 +43,33 @@ func verify(cmd *cobra.Command, files []string) {

for _, filepath := range files {
startTime := time.Now()
valid := true
valid := true // The WARC file is valid
allRecordsReaded := false // All records readed successfully
yzqzss marked this conversation as resolved.
Show resolved Hide resolved
errorsCount := 0
recordCount := 0 // Count of records processed

recordChan := make(chan *warc.Record, threads*2)
results := make(chan result, threads*2)

var processWg sync.WaitGroup
var recordReaderWg sync.WaitGroup

if !cmd.Flags().Lookup("json").Changed {
// Output the message if not in --json mode
logrus.WithFields(logrus.Fields{
"file": filepath,
"threads": threads,
}).Info("verifying")
}
for i := 0; i < threads; i++ {
processWg.Add(1)
go func() {
defer processWg.Done()
for record := range recordChan {
processVerifyRecord(record, filepath, results)
}
}()
}

f, err := os.Open(filepath)
if err != nil {
Expand All @@ -45,51 +87,84 @@ func verify(cmd *cobra.Command, files []string) {
return
}

for {
record, err := reader.ReadRecord()
if err != nil {
if err != io.EOF {
// Read records and send them to workers
recordReaderWg.Add(1)
go func() {
defer recordReaderWg.Done()
defer close(recordChan)
for {
record, eof, err := reader.ReadRecord()
if eof {
allRecordsReaded = true
break
}
if err != nil {
if record == nil {
logrus.WithFields(logrus.Fields{
"file": filepath,
}).Errorf("failed to read record: %v", err)
} else {
logrus.WithFields(logrus.Fields{
"file": filepath,
"recordId": record.Header.Get("WARC-Record-ID"),
}).Errorf("failed to read record: %v", err)
}
errorsCount++
valid = false
return
}
recordCount++

// Only process Content-Type: application/http; msgtype=response (no reason to process requests or other records)
if !strings.Contains(record.Header.Get("Content-Type"), "msgtype=response") {
logrus.WithFields(logrus.Fields{
"file": filepath,
"recordId": record.Header.Get("WARC-Record-ID"),
}).Errorf("failed to read all record content: %v", err)
return
}).Debugf("skipping record with Content-Type: %s", record.Header.Get("Content-Type"))
continue
}
break
}

// Only process Content-Type: application/http; msgtype=response (no reason to process requests or other records)
if !strings.Contains(record.Header.Get("Content-Type"), "msgtype=response") {
logrus.WithFields(logrus.Fields{
"file": filepath,
"recordId": record.Header.Get("WARC-Record-ID"),
}).Debugf("skipping record with Content-Type: %s", record.Header.Get("Content-Type"))
continue
}
// We cannot verify the validity of Payload-Digest on revisit records yet.
if record.Header.Get("WARC-Type") == "revisit" {
logrus.Debugf("skipping revisit record")
continue
}

// We cannot verify the validity of Payload-Digest on revisit records yet.
if record.Header.Get("WARC-Type") == "revisit" {
logrus.Debugf("skipping revisit record")
continue
recordChan <- record
}
}()

blockDigestErrorsCount, blockDigestValid := verifyBlockDigest(record, filepath)
errorsCount += blockDigestErrorsCount
if !blockDigestValid {
valid = false
}
// Collect results from workers

payloadDigestErrorsCount, payloadDigestValid := verifyPayloadDigest(record, filepath)
errorsCount += payloadDigestErrorsCount
if !payloadDigestValid {
valid = false
recordReaderWg.Add(1)
go func() {
defer recordReaderWg.Done()
for res := range results {
if !res.blockDigestValid {
valid = false
errorsCount += res.blockDigestErrorsCount
}
if !res.payloadDigestValid {
valid = false
errorsCount += res.payloadDigestErrorsCount
}
if !res.warcVersionValid {
valid = false
errorsCount++
}
}
}
}()

processWg.Wait()
close(results)
recordReaderWg.Wait()

logger.WithFields(logrus.Fields{
"file": filepath,
"valid": valid,
"errors": errorsCount,
"file": filepath,
"valid": valid,
"errors": errorsCount,
"count": recordCount,
"allRecordsReaded": allRecordsReaded,
}).Infof("verified in %s", time.Since(startTime))
}
}
Expand Down Expand Up @@ -238,3 +313,16 @@ func verifyBlockDigest(record *warc.Record, filepath string) (errorsCount int, v

return errorsCount, valid
}

func verifyWarcVersion(record *warc.Record, filepath string) (valid bool) {
valid = true
if record.Version != "WARC/1.0" && record.Version != "WARC/1.1" {
logrus.WithFields(logrus.Fields{
"file": filepath,
"recordId": record.Header.Get("WARC-Record-ID"),
}).Errorf("invalid WARC version: %s", record.Version)
valid = false
}

return valid
}
Loading