Skip to content

Commit

Permalink
Bump to 0.3.2
Browse files Browse the repository at this point in the history
  • Loading branch information
wielinde committed Sep 2, 2019
2 parents ddaf7e5 + 00053a3 commit aac988d
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.2] - 2019-09-02
- Set minimum Nokogiri version to 1.10.4. See CVE-2019-5477.
- Fix encoding issues for PDFs.

## [0.3.1] - 2019-01-16

### Added
Expand Down
34 changes: 29 additions & 5 deletions lib/plaintext/file_handler/external_command_handler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,23 @@ class ExternalCommandHandler < FileHandler
# Due to how popen works the command will be executed directly without
# involving the shell if cmd is an array.
require 'fileutils'

FILE_PLACEHOLDER = '__FILE__'.freeze
DEFAULT_STREAM_ENCODING = 'ASCII-8BIT'.freeze

# Runs +cmd+ through IO.popen in "r+" mode and passes the pipe to the block.
#
# cmd     - command handed straight to IO.popen.
# options - hash; only :write_stdin is read here. Unless it is truthy, the
#           write end of the pipe is closed before the block runs.
#
# Returns the value of the IO.popen block, i.e. the caller's block result,
# or nil when no block was given.
def shellout(cmd, options = {}, &block)
mode = "r+"
IO.popen(cmd, mode) do |io|
# NOTE(review): this line and the next look like the removed/added pair of
# the rendered diff (inline call replaced by set_stream_encoding) - confirm
# against the raw patch before treating both as live code.
io.set_encoding("ASCII-8BIT") if io.respond_to?(:set_encoding)
set_stream_encoding(io)
io.close_write unless options[:write_stdin]
block.call(io) if block_given?
end
end

FILE_PLACEHOLDER = '__FILE__'.freeze

# Extracts text from +file+ by running the handler's command on it.
#
# A copy of @command has its FILE_PLACEHOLDER entry replaced with the file's
# path; the command output is read through #read, limited by
# options[:max_size], and returned as a String (nil becomes "" via to_s).
def text(file, options = {})
cmd = @command.dup
cmd[cmd.index(FILE_PLACEHOLDER)] = Pathname(file).to_s
# NOTE(review): the two shellout lines below differ only in whitespace and
# look like the removed/added pair of the rendered diff - confirm against the
# raw patch; as written the command would be executed twice.
shellout(cmd){ |io| read io, options[:max_size] }.to_s
shellout(cmd) { |io| read io, options[:max_size] }.to_s
end


Expand All @@ -41,10 +43,32 @@ def self.available?
new.available?
end

protected

# Tells the stream helpers whether the external command already emits UTF-8.
# This implementation always answers false; set_stream_encoding and read
# consult it to decide between UTF-8 and DEFAULT_STREAM_ENCODING.
def utf8_stream?
  false
end

private

# Applies the encoding expected from the external command to +io+.
#
# Objects that do not respond to set_encoding are left untouched and nil is
# returned. Otherwise the stream is switched to UTF-8 when utf8_stream? is
# true, or to DEFAULT_STREAM_ENCODING when it is not.
def set_stream_encoding(io)
  if io.respond_to?(:set_encoding)
    encoding = utf8_stream? ? 'UTF-8'.freeze : DEFAULT_STREAM_ENCODING
    io.set_encoding(encoding)
  end
end

# Reads up to +max_size+ bytes from +io+ (io.read(nil) reads to EOF).
#
# When utf8_stream? is true the data is returned as read; otherwise it is
# passed through Plaintext::CodesetUtil.to_utf8 with DEFAULT_STREAM_ENCODING
# as the source encoding.
def read(io, max_size = nil)
# NOTE(review): the next line looks like the removed side of the rendered
# diff (superseded by the piece/utf8_stream? logic below) - confirm against
# the raw patch; as written it would consume the stream before `piece`.
Plaintext::CodesetUtil.to_utf8 io.read(max_size), "ASCII-8BIT"
piece = io.read(max_size)

if utf8_stream?
piece
else
Plaintext::CodesetUtil.to_utf8 piece, DEFAULT_STREAM_ENCODING
end
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,16 @@ class PdfHandler < ExternalCommandHandler
# Fallback command used by initialize when Plaintext::Configuration has no
# 'pdftotext' entry. '-enc UTF-8' asks for UTF-8 output.
# NOTE(review): '__FILE__' presumably is the placeholder the parent handler
# swaps for the real path, and the trailing '-' presumably sends output to
# stdout - confirm against the parent class and the pdftotext manual.
DEFAULT = [
'/usr/bin/pdftotext', '-enc', 'UTF-8', '__FILE__', '-'
].freeze

# Sets up the handler for PDF input: records the 'application/pdf' content
# type and picks the command from Plaintext::Configuration['pdftotext'],
# falling back to DEFAULT when that entry is nil or false.
def initialize
  @content_type = 'application/pdf'
  configured = Plaintext::Configuration['pdftotext']
  @command = configured || DEFAULT
end

protected

# Answers true: the DEFAULT pdftotext command is invoked with '-enc UTF-8'.
# NOTE(review): a command taken from Plaintext::Configuration is presumably
# expected to emit UTF-8 as well - confirm with the configuration docs.
def utf8_stream?
  true
end
end
end
2 changes: 1 addition & 1 deletion lib/plaintext/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

module Plaintext
# Gem version string.
# NOTE(review): both assignments come from the rendered diff - "0.3.1"
# appears to be the removed line and "0.3.2" the added one (commit title:
# "Bump to 0.3.2"); confirm against the raw patch.
VERSION = "0.3.1"
VERSION = "0.3.2"
end
2 changes: 1 addition & 1 deletion plaintext.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Gem::Specification.new do |spec|
spec.add_dependency 'nokogiri', '>= 1.10.4'
spec.add_dependency 'activesupport', '>2.2.1 '

spec.add_development_dependency "bundler", "~> 1.10"
spec.add_development_dependency "bundler"
spec.add_development_dependency "rake", "~> 10.0"
spec.add_development_dependency "rspec"
end
Binary file added spec/fixtures/files/text-with-umlaut.pdf
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
expect(subject.text(file)).to match /lorem ipsum fulltext find me!/
expect(Plaintext::Resolver.new(file, 'application/pdf').text).to match /lorem ipsum fulltext find me!/
end

it 'should extract umlauts correctly into UTF-8' do
file = File.new('spec/fixtures/files/text-with-umlaut.pdf', 'r')

expect(subject.text(file)).to match /In der Küche hat es eine Kaffeemaschine/
expect(Plaintext::Resolver.new(file, 'application/pdf').text).to match /In der Küche hat es eine Kaffeemaschine/
end
else
warn "#{described_class.name} could not be tested as external program is not available."
end
Expand Down

0 comments on commit aac988d

Please sign in to comment.