Add ability to view plaintext versions of PDFs

This commit is contained in:
TEC 2021-04-13 00:14:31 +08:00
parent eb3bc4cf8d
commit e881c40bc3
Signed by: tec
GPG Key ID: 779591AFDB81F06C
1 changed files with 165 additions and 0 deletions

View File

@ -1570,6 +1570,20 @@ file uploading script (which I've renamed to ~upload~).
:defer t
:config (setq screenshot-upload-fn "upload %s 2>/dev/null"))
#+end_src
**** Prettier page break lines
In some files, =^L= appears as a page break character. This isn't that visually
appealing, and Steve Purcell has been nice enough to make a package to display
these as horizontal rules.
#+begin_src emacs-lisp
;; Declare the page-break-lines package, fetched from the GitHub repository
;; purcell/page-break-lines.
(package! page-break-lines :recipe (:host github :repo "purcell/page-break-lines"))
#+end_src
#+begin_src emacs-lisp :tangle yes
;; Configure page-break-lines: `page-break-lines-mode' is listed as the
;; command for the package, and `turn-on-page-break-lines-mode' is registered
;; as an autoload from the same library.
(use-package! page-break-lines
  :commands (page-break-lines-mode)
  :init
  (autoload 'turn-on-page-break-lines-mode "page-break-lines"))
#+end_src
** Language packages
*** LaTeX
For mathematical convenience, WIP
@ -9568,6 +9582,157 @@ priority of =mypyls=
(after! lsp-python-ms
(set-lsp-priority! 'mspyls 1))
#+end_src
** PDF
=pdf-tools= is quite nice (though =paper= is looking quite promising); however,
sometimes I'm in a terminal and I still want to see the content. Additionally,
sometimes I'd like to act on the content and so would like a plaintext version.
#+begin_info
This is a candidate for a dedicated package.
Let me know if you'd like to see this.
#+end_info
Thanks to src_shell{pdftotext} we have a convenient way of performing this
conversion.
#+begin_src emacs-lisp
(defun pdf-text--update (&optional _window)
  "Refresh the current `pdf-text-mode' buffer with text wrapped to fit.

Does nothing unless the buffer's major mode is `pdf-text-mode' and the
buffer is visiting a file.

The PDF is converted with \"pdftotext\" into a cache file inside
`temporary-file-directory', and that text is re-wrapped with \"fmt\" into a
second, per-width cache file.  Each cache file is regenerated only when it
is missing or out of date.  When the per-width file differs from the one
recorded in the buffer-local variable `pdf-text--file', the buffer contents
are replaced with it and point is restored on a best-effort basis.

_WINDOW is ignored; it is accepted so that this function can be used on
hooks which pass an argument."
  (when (and (eq major-mode 'pdf-text-mode)
             buffer-file-name)
    (let* ((converted-file (expand-file-name
                            (concat
                             (file-name-base buffer-file-name)
                             "-"
                             ;; Short hash of the full path, so same-named PDFs
                             ;; in different directories get separate caches.
                             (substring (secure-hash 'sha1 (expand-file-name buffer-file-name)) 0 6)
                             ".txt")
                            temporary-file-directory))
           (width (number-to-string
                   (- (min (window-width) fill-column)
                      ;; `display-line-numbers-width' is nil when the width is
                      ;; determined automatically; count that as no columns
                      ;; rather than passing nil to `-'.
                      (or (and display-line-numbers display-line-numbers-width) 0))))
           (width-adjusted-file (concat (file-name-sans-extension converted-file) "-w" width ".txt")))
      ;; Extract the text unless a cached copy newer than the PDF exists.
      (unless (file-newer-than-file-p converted-file buffer-file-name)
        (call-process "pdftotext" nil nil nil "-layout" "-eol" "unix" buffer-file-name converted-file))
      ;; Re-wrap unless a copy for this width exists and is up to date.
      (unless (and (file-exists-p width-adjusted-file)
                   (not (file-newer-than-file-p converted-file width-adjusted-file)))
        (call-process "fmt" nil (list :file width-adjusted-file) nil "-w" width converted-file))
      ;; Only touch the buffer when it is not already showing this file.
      (unless (and (boundp 'pdf-text--file)
                   (string= pdf-text--file width-adjusted-file))
        ;; Position info is only meaningful once the buffer has been filled.
        (let ((pos (when (boundp 'pdf-text--file) (pdf-text--position-info))))
          (with-silent-modifications
            (let ((inhibit-read-only t)
                  (coding-system-for-read 'utf-8))
              (erase-buffer)
              (insert-file-contents width-adjusted-file)
              ;; Put every form feed on a line of its own.
              (while (re-search-forward "\n?\f\n?" nil t)
                (replace-match "\n\f\n"))
              (goto-char (point-min)))
            (setq-local pdf-text--file width-adjusted-file))
          (when pos (ignore-errors (pdf-text--goto-pos pos))))))))
#+end_src
Now we just need to make a mode to use this.
#+begin_src emacs-lisp
(define-derived-mode pdf-text-mode so-long-mode "PDF Text" ; so-long for the initial buffer load time
  "Major mode for viewing the plaintext version of a PDF."
  (set-buffer-multibyte t)
  (read-only-mode t)
  ;; Re-wrap the text whenever the window layout, the window size, or the
  ;; line-number display changes.
  (dolist (hook '(window-configuration-change-hook
                  window-size-change-functions
                  display-line-numbers-mode-hook))
    (add-hook hook #'pdf-text--update))
  (pdf-text--update)
  ;; NOTE(review): `text-mode' kills buffer-local variables and resets
  ;; `major-mode', which `pdf-text--update' checks before doing any work --
  ;; confirm that later hook-driven updates still run as intended.
  (text-mode)
  (setq mode-name "PDF Text")
  ;; Refuse to save from this buffer only.  The hook is made buffer-local so
  ;; that saving other buffers is unaffected, and it is added after the
  ;; `text-mode' call above because that call would wipe a local hook.
  (add-hook 'before-save-hook
            (lambda () (user-error "Will not overwrite PDF with plaintext version"))
            nil t))
#+end_src
In src_elisp{(pdf-text--update)} there's mention of position saving and
restoring. This needs to be implemented, and it's a bit difficult since the line
numbers and buffer positions are liable to change. So, instead we can try to
take note of some markers (such as the page breaks and the text of nearby
lines) and try to make our way to them.
#+begin_src emacs-lisp
(defun pdf-text--position-info ()
  "Return a plist describing where point is, in terms that survive re-wrapping.

The plist has three entries:
  :page-no                the number of form feeds between the start of the
                          buffer and point;
  :par-start              the line reached by moving back one paragraph and
                          then forward one line;
  :previous-line-content  the line before the one point is on.

The two line values come from `thing-at-point' and so may be nil."
  (list :page-no (let ((current-point (point))
                       (page-no 0))
                   (save-excursion
                     ;; Count from the top of the buffer.  Searching from
                     ;; point with point as the bound would never match.
                     (goto-char (point-min))
                     (while (search-forward "\f" current-point t)
                       (setq page-no (1+ page-no))))
                   page-no)
        :par-start (save-excursion
                     (forward-paragraph -1)
                     (forward-line 1)
                     (thing-at-point 'line t))
        :previous-line-content (save-excursion
                                 (forward-line -1)
                                 (thing-at-point 'line t))))
(defun pdf-text--line-regexp (line)
  "Return a regexp matching LINE regardless of where it has been re-wrapped.

Trailing spaces and newlines are dropped, since a line taken from the buffer
ends in a newline that may have become a space (or moved) after re-wrapping,
and each run of spaces is allowed to match spaces or newlines.
Return nil when LINE is not a string or has no content left after trimming."
  (when (stringp line)
    (let ((text (replace-regexp-in-string "[ \n]+\\'" "" line)))
      (unless (string= text "")
        (replace-regexp-in-string " +" "[ \n]+" (regexp-quote text))))))

(defun pdf-text--goto-pos (pos)
  "Move point to the location described by POS.

POS is a plist with the keys :page-no, :par-start and
:previous-line-content, as produced by `pdf-text--position-info'.  Starting
from the beginning of the buffer, skip :page-no form feeds, then search for
the text of :par-start, then (when it differs from :par-start) for the text
of :previous-line-content within the paragraph that follows.

Signals an error when a search fails; anchors that are nil or blank are
skipped."
  (let ((par-start (plist-get pos :par-start))
        (previous-line (plist-get pos :previous-line-content)))
    (goto-char (point-min))
    (search-forward "\f" nil nil (plist-get pos :page-no))
    (let ((par-re (pdf-text--line-regexp par-start)))
      (when par-re
        (re-search-forward par-re)))
    (unless (equal par-start previous-line)
      (let ((line-re (pdf-text--line-regexp previous-line)))
        (when line-re
          ;; Stay inside the paragraph that was just located.
          (re-search-forward line-re
                             (save-excursion (forward-paragraph 1) (point))))))))
#+end_src
Unfortunately, while this position restoring works well in isolated testing, for
some reason it doesn't seem to work at all as it's currently used.
The output can be slightly nicer without spelling errors, and with prettier page
feeds (=^L= by default).
#+begin_src emacs-lisp
;; In PDF text buffers: switch spell-fu off and turn page-break-lines on.
(dolist (fn (list #'spell-fu-mode-disable
                  (lambda () (page-break-lines-mode 1))))
  (add-hook 'pdf-text-mode-hook fn))
#+end_src
This is very nice, now we just need to associate it with =.pdf= files, and make
sure =pdf-tools= doesn't take priority.
#+begin_src emacs-lisp
(defconst pdf-text-auto-mode-alist-entry
  ;; Matches a ".pdf" extension in any letter case.
  (cons "\\.[pP][dD][fF]\\'" 'pdf-text-mode)
  "The `auto-mode-alist' entry that opens PDF files in `pdf-text-mode'.")
(defun pdf-text-install ()
  "Add a \".pdf\" association for all future buffers.

Registers `pdf-text-auto-mode-alist-entry' in `auto-mode-alist'.  When the
`pdf-tools' feature is loaded, its own entries are also removed from
`auto-mode-alist' and `magic-mode-alist'."
  (interactive)
  (add-to-list 'auto-mode-alist pdf-text-auto-mode-alist-entry)
  (when (featurep 'pdf-tools)
    (set-default 'auto-mode-alist
                 (remove pdf-tools-auto-mode-alist-entry auto-mode-alist))
    (set-default 'magic-mode-alist
                 (remove pdf-tools-magic-mode-alist-entry magic-mode-alist))))
;; Named `pdf-text-uninstall' to pair with `pdf-text-install'.  Defining this
;; under the name `pdf-tools-uninstall' would overwrite the command of the
;; same name that pdf-tools provides.
(defun pdf-text-uninstall ()
  "Remove the \".pdf\" association for all future buffers.

Deletes `pdf-text-auto-mode-alist-entry' from `auto-mode-alist'."
  (interactive)
  (setq-default auto-mode-alist
                (remove pdf-text-auto-mode-alist-entry auto-mode-alist)))
#+end_src
Lastly, whenever Emacs is non-graphical (i.e. a TUI), we want to use this by default.
#+begin_src emacs-lisp :tangle (if (executable-find "pdftotext") "yes" "no")
;; On a non-graphical display, make `pdf-text-mode' the handler for PDFs.
(when (not (display-graphic-p))
  (pdf-text-install)
  ;; From Doom's :tools pdf (use-package! pdf-tools)
  ;; Each element is (ALIST-VARIABLE . ENTRY-TO-REMOVE).
  (dolist (spec '((auto-mode-alist "\\.pdf\\'" . pdf-view-mode)
                  (magic-mode-alist "%PDF" . pdf-view-mode)))
    (set-default (car spec)
                 (remove (cdr spec) (symbol-value (car spec)))))
  ;; I have no idea why this is needed
  (map! :map pdf-text-mode-map
        "<mouse-4>" (cmd! (scroll-down mouse-wheel-scroll-amount-horizontal))
        "<mouse-5>" (cmd! (scroll-up mouse-wheel-scroll-amount-horizontal))))
#+end_src
** R
*** Editor Visuals
#+begin_src emacs-lisp