SouthFox's Garden

Search IconA magnifying glass icon. 搜索

种植日期:

上次照料:

2026-04-08

Douban 元数据抓取

文献管理 的一种实践。

(defun my/set-book-metadata-by-douban (search-text)
  (interactive (list (read-string "Query: ")))
  (request
    "https://search.douban.com/book/subject_search"
    :headers (list (cons "User-Agent" "Mozilla/5.0 X11; Linux x86_64; rv:149.0) Gecko/20100101 Firefox/149.0"))
    :params `(("cat" . "1001") ("search_text" . ,search-text))
    :sync t
    :success (cl-function
	            (lambda (&key data &allow-other-keys)
                (when (string-match "window\\.__DATA__ = \\({.+?}\\);$" data)
                  (let* ((json-str (match-string 1 data))
                         (items (plist-get (json-parse-string json-str :object-type 'plist) :items))
                         (sorted-items (sort items
                                             (lambda (a b)
                                               (let ((r1 (or (thread-first a (plist-get :rating) (plist-get :value)) 0))
                                                     (r2 (or (thread-first b (plist-get :rating) (plist-get :value)) 0)))
                                                 (> r1 r2)))))
                         (candidates (mapcar
                                      (lambda (cand)
                                        (let ((rating (thread-first cand (plist-get :rating) (plist-get :value))))
                                          (propertize (string-join
                                                       (list (or (plist-get cand :title) "No Title")
                                                             (or (plist-get cand :abstract) "")
                                                             (if rating (number-to-string rating) "N/A"))
                                                       " | ")
                                                      :url (plist-get cand :url))))
                                      sorted-items))
                         (book-url (consult--read
                                    candidates
                                    :sort nil
                                    :lookup (apply-partially #'consult--lookup-prop :url)))
                         (book-info (my/douban--get-book-info book-url)))
                    (if (derived-mode-p 'org-mode)
                        (progn
                          (org-back-to-heading t)
                          (org-set-property "ISBN" (cdr (assoc "ISBN" book-info)))
                          (org-set-property "AUTHOR" (cdr (assoc "作者" book-info)))
                          (org-set-property "PUBLISH_DATE" (cdr (assoc "出版年" book-info)))
                          (org-set-property "PAGES" (cdr (assoc "页数" book-info)))
                          (org-set-property "DOUBAN_SUBJECT" book-url))
                      (prin1 (cons (cons "book-url" book-url) book-info)))
                    ))))))


(defun my/douban--get-book-info (book-url)
  (let ((html-data))
    (request
      book-url
      :headers (list (cons "User-Agent" "Mozilla/5.0 X11; Linux x86_64; rv:149.0) Gecko/20100101 Firefox/149.0"))
      :sync t
      :success (cl-function
	              (lambda (&key data &allow-other-keys)
                  (setq html-data data))))
    (let* ((info-node
            (with-temp-buffer
              (insert html-data)
              (car (dom-by-id (libxml-parse-html-region (point-min) (point-max)) "^info$"))))
           (children (dom-children info-node))
           (results '())
           (i 0))
      (while (< i (length children))
        (let* ((node (nth i children))
               (pl-node (when (listp node)
                          (if (equal (dom-attr node 'class) "pl")
                              node
                            (car (dom-by-class node "pl"))))))

          (when pl-node
            (let* ((key (string-remove-suffix ":" (string-trim (dom-texts pl-node))))
                   (full-node-text (dom-texts node))
                   (internal-val (string-trim
                                  (replace-regexp-in-string
                                   (concat "^.*" (regexp-quote key) "[ \t\n\r::]*") ""
                                   full-node-text)))
                   (value nil))
              (if (not (string-empty-p internal-val))
                  (setq value internal-val)
                (let ((next-node (nth (1+ i) children)))
                  (when next-node
                    (setq value (string-trim (if (stringp next-node) next-node (dom-texts next-node)) "[ \t\n\r::\u00a0]+"))
                    (setq i (1+ i)))))
              (when (and key (not (string-empty-p value)))
                (push (cons key value) results)))))
        (setq i (1+ i)))
      results)))