Skip to content

Commit

Permalink
Support reading more compression formats (#37)
Browse files Browse the repository at this point in the history
* feat: support read zstd, zstd_with_dict, xz and raw warcs

* 20% faster reading speed

* feat: 2.5x faster verify :)

* feat: verify warc version

* make ReadRecord() returns a bool to distinguish (ab)normal io.EOF error

* raise `early EOF record boundary` error
  • Loading branch information
yzqzss authored Aug 2, 2024
1 parent 1337575 commit b52e065
Show file tree
Hide file tree
Showing 9 changed files with 397 additions and 105 deletions.
12 changes: 6 additions & 6 deletions cmd/extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,14 @@ func extract(cmd *cobra.Command, files []string) {
}(resultsChan)

for {
record, err := reader.ReadRecord()
if err != nil {
if err != io.EOF {
logrus.Errorf("failed to read all record content: %v", err)
return
}
record, eol, err := reader.ReadRecord()
if eol {
break
}
if err != nil {
logrus.Errorf("failed to read all record content: %v", err)
return
}

swg.Add()
go processRecord(cmd, record, &resultsChan, &swg)
Expand Down
3 changes: 2 additions & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"os"
"runtime"

"github.com/spf13/cobra"
)
Expand All @@ -17,7 +18,7 @@ func init() {
extractCmd.Flags().Bool("host-sort", false, "Sort the extracted URLs by host")
extractCmd.Flags().Bool("hash-suffix", false, "When duplicate file names exist, the hash will be added if a duplicate file name exists. ")

verifyCmd.Flags().IntP("threads", "t", 1, "Number of threads to use for verification")
verifyCmd.Flags().IntP("threads", "t", runtime.NumCPU(), "Number of threads to use for verification")
verifyCmd.Flags().Bool("json", false, "Output results in JSON format")
}

Expand Down
166 changes: 127 additions & 39 deletions cmd/verify.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,39 @@ package main

import (
"bufio"
"io"
"net/http"
"os"
"strconv"
"strings"
"sync"
"time"

"github.com/CorentinB/warc"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
)

func processVerifyRecord(record *warc.Record, filepath string, results chan<- result) {
var res result
res.blockDigestErrorsCount, res.blockDigestValid = verifyBlockDigest(record, filepath)
res.payloadDigestErrorsCount, res.payloadDigestValid = verifyPayloadDigest(record, filepath)
res.warcVersionValid = verifyWarcVersion(record, filepath)
results <- res
}

type result struct {
warcVersionValid bool
blockDigestErrorsCount int
blockDigestValid bool
payloadDigestErrorsCount int
payloadDigestValid bool
}

func verify(cmd *cobra.Command, files []string) {
// threads, err := strconv.Atoi(cmd.Flags().Lookup("threads").Value.String())
// if err != nil {
// logrus.Fatalf("failed to parse threads: %s", err.Error())
// }
threads, err := strconv.Atoi(cmd.Flags().Lookup("threads").Value.String())
if err != nil {
logrus.Fatalf("failed to parse threads: %s", err.Error())
}

logger := logrus.New()
if cmd.Flags().Lookup("json").Changed {
Expand All @@ -26,8 +43,33 @@ func verify(cmd *cobra.Command, files []string) {

for _, filepath := range files {
startTime := time.Now()
valid := true
valid := true // The WARC file is valid
allRecordsRead := false // All records readed successfully
errorsCount := 0
recordCount := 0 // Count of records processed

recordChan := make(chan *warc.Record, threads*2)
results := make(chan result, threads*2)

var processWg sync.WaitGroup
var recordReaderWg sync.WaitGroup

if !cmd.Flags().Lookup("json").Changed {
// Output the message if not in --json mode
logrus.WithFields(logrus.Fields{
"file": filepath,
"threads": threads,
}).Info("verifying")
}
for i := 0; i < threads; i++ {
processWg.Add(1)
go func() {
defer processWg.Done()
for record := range recordChan {
processVerifyRecord(record, filepath, results)
}
}()
}

f, err := os.Open(filepath)
if err != nil {
Expand All @@ -45,51 +87,84 @@ func verify(cmd *cobra.Command, files []string) {
return
}

for {
record, err := reader.ReadRecord()
if err != nil {
if err != io.EOF {
// Read records and send them to workers
recordReaderWg.Add(1)
go func() {
defer recordReaderWg.Done()
defer close(recordChan)
for {
record, eof, err := reader.ReadRecord()
if eof {
allRecordsRead = true
break
}
if err != nil {
if record == nil {
logrus.WithFields(logrus.Fields{
"file": filepath,
}).Errorf("failed to read record: %v", err)
} else {
logrus.WithFields(logrus.Fields{
"file": filepath,
"recordId": record.Header.Get("WARC-Record-ID"),
}).Errorf("failed to read record: %v", err)
}
errorsCount++
valid = false
return
}
recordCount++

// Only process Content-Type: application/http; msgtype=response (no reason to process requests or other records)
if !strings.Contains(record.Header.Get("Content-Type"), "msgtype=response") {
logrus.WithFields(logrus.Fields{
"file": filepath,
"recordId": record.Header.Get("WARC-Record-ID"),
}).Errorf("failed to read all record content: %v", err)
return
}).Debugf("skipping record with Content-Type: %s", record.Header.Get("Content-Type"))
continue
}
break
}

// Only process Content-Type: application/http; msgtype=response (no reason to process requests or other records)
if !strings.Contains(record.Header.Get("Content-Type"), "msgtype=response") {
logrus.WithFields(logrus.Fields{
"file": filepath,
"recordId": record.Header.Get("WARC-Record-ID"),
}).Debugf("skipping record with Content-Type: %s", record.Header.Get("Content-Type"))
continue
}
// We cannot verify the validity of Payload-Digest on revisit records yet.
if record.Header.Get("WARC-Type") == "revisit" {
logrus.Debugf("skipping revisit record")
continue
}

// We cannot verify the validity of Payload-Digest on revisit records yet.
if record.Header.Get("WARC-Type") == "revisit" {
logrus.Debugf("skipping revisit record")
continue
recordChan <- record
}
}()

blockDigestErrorsCount, blockDigestValid := verifyBlockDigest(record, filepath)
errorsCount += blockDigestErrorsCount
if !blockDigestValid {
valid = false
}
// Collect results from workers

payloadDigestErrorsCount, payloadDigestValid := verifyPayloadDigest(record, filepath)
errorsCount += payloadDigestErrorsCount
if !payloadDigestValid {
valid = false
recordReaderWg.Add(1)
go func() {
defer recordReaderWg.Done()
for res := range results {
if !res.blockDigestValid {
valid = false
errorsCount += res.blockDigestErrorsCount
}
if !res.payloadDigestValid {
valid = false
errorsCount += res.payloadDigestErrorsCount
}
if !res.warcVersionValid {
valid = false
errorsCount++
}
}
}
}()

processWg.Wait()
close(results)
recordReaderWg.Wait()

logger.WithFields(logrus.Fields{
"file": filepath,
"valid": valid,
"errors": errorsCount,
"file": filepath,
"valid": valid,
"errors": errorsCount,
"count": recordCount,
"allRecordsRead": allRecordsRead,
}).Infof("verified in %s", time.Since(startTime))
}
}
Expand Down Expand Up @@ -238,3 +313,16 @@ func verifyBlockDigest(record *warc.Record, filepath string) (errorsCount int, v

return errorsCount, valid
}

func verifyWarcVersion(record *warc.Record, filepath string) (valid bool) {
valid = true
if record.Version != "WARC/1.0" && record.Version != "WARC/1.1" {
logrus.WithFields(logrus.Fields{
"file": filepath,
"recordId": record.Header.Get("WARC-Record-ID"),
}).Errorf("invalid WARC version: %s", record.Version)
valid = false
}

return valid
}
Loading

0 comments on commit b52e065

Please sign in to comment.