I read a Reddit thread about extracting text from PDF files. It looked
interesting, so after some searching I decided to try the
github.com/ledongthuc/pdf package. The following code is a modified version
of the package's sample code.
readpdftext.go |
repository |
view raw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package readpdftext
import (
"bytes"
"github.com/ledongthuc/pdf"
)
// ReadPlainTextFromPDF opens the PDF file at pdfpath and returns its content
// as plain text.
//
// A non-nil error is returned when the file cannot be opened, when plain-text
// extraction fails, or when reading the extracted text fails. In every error
// case the returned text is empty.
func ReadPlainTextFromPDF(pdfpath string) (text string, err error) {
	f, r, err := pdf.Open(pdfpath)
	// Register the close only for a real handle: f may be nil when Open
	// fails. Checking f rather than err also releases the file in case Open
	// hands back a handle together with an error (NOTE(review): confirm
	// against the pdf package's Open implementation).
	if f != nil {
		defer f.Close()
	}
	if err != nil {
		return "", err
	}

	b, err := r.GetPlainText()
	if err != nil {
		return "", err
	}

	// Drain the extracted text; a read failure must not be reported as a
	// successful (but truncated) result.
	var buf bytes.Buffer
	if _, err = buf.ReadFrom(b); err != nil {
		return "", err
	}
	return buf.String(), nil
}
Usage
readpdftext_test.go |
repository |
view raw
1
2
3
4
5
6
7
8
9
10
11
12
13
14 package readpdftext
import (
"fmt"
)
// ExampleReadPlainTextFromPDF reads "test.pdf" with ReadPlainTextFromPDF and
// prints the extracted text to standard output, panicking if the read fails.
func ExampleReadPlainTextFromPDF() {
	text, readErr := ReadPlainTextFromPDF("test.pdf")
	if readErr != nil {
		panic(readErr)
	}

	fmt.Println(text)
}
Tested on: Ubuntu Linux 18.04, Go 1.11
References