-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathweb-client.lisp
256 lines (213 loc) · 10.3 KB
/
web-client.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
;;;; web-client.lisp
(in-package #:harlie)
; There is undoubtedly a better way to extract the text from TITLE tags,
; but this is what we're stuck with for now.
(defun extract-from-html (tree anchor-p extractor)
  "Recursive traversal of the nested list representation of an HTML tree.
anchor-p is a predicate which will match when we find what we're looking for.
extractor is a function which returns whatever you want from that site.
Only the first match is returned."
  (declare (optimize (speed 0) (safety 3) (debug 3)))
  (cond
    ;; If what we've been handed isn't even a list, then we aren't
    ;; finding anything here.
    ((not (listp tree)) nil)
    ;; Failing that, see if we find what we're looking for right here,
    ;; and if so, extract and return a result.
    ((funcall anchor-p tree) (funcall extractor tree))
    ;; Otherwise, recurse into every sublist element of TREE.  SOME
    ;; stops at (and returns) the first non-NIL recursive result, so
    ;; only the first match in document order is returned -- exactly
    ;; what the old do*-loop-with-FOUND-flag did, without the flag.
    (t (some #'(lambda (element)
                 (when (listp element)
                   (extract-from-html element anchor-p extractor)))
             tree))))
(defun title-anchor (tree)
  "Predicate which detects a :TITLE tag that actually carries text.
A bare (:TITLE attrs) with no children is not a match."
  (and (equal :title (car tree))
       (> (length tree) 2)))
(defun title-extractor (tree)
  "Extract the text for a :TITLE tag by concatenating every child
past the tag keyword and its attribute list."
  (let ((text-nodes (cddr tree)))
    (apply #'concatenate 'string text-nodes)))
(defun find-title (tree)
  "Search recursively for a :title tag in a nested list, and return the text."
  ;; TITLE-ANCHOR spots the node; TITLE-EXTRACTOR flattens its text children.
  (extract-from-html tree #'title-anchor #'title-extractor))
(defun find-forex (tree)
  "Plunder a nested list for XE.com's forex information."
  ;; Two SEPARATE subtrees carry the numbers we want:
  ;;   (:SPAN ((:CLASS "amount") (:DATA-AMOUNT "2.85")) "2.85")  -> "2.85"
  ;;   (:SPAN ((:CLASS "uccResultAmount")) "3.76024")            -> "3.76024"
  ;; O the things we do.
  (flet ((amount-anchor (node)
           ;; Matches the attribute pair (:DATA-AMOUNT "...").
           (equal (car node) :data-amount))
         (amount-value (node)
           (second node))
         (result-anchor (node)
           ;; Matches a :SPAN whose first attribute is (:CLASS "uccResultAmount").
           (and (equal (car node) :span)
                (listp (second node))
                (listp (car (second node)))
                (equal (car (car (second node))) :class)
                (string-equal (second (car (second node))) "uccResultAmount")))
         (result-value (node)
           (third node)))
    (list (extract-from-html tree #'amount-anchor #'amount-value)
          (extract-from-html tree #'result-anchor #'result-value))))
;; A small pool of plausible browser User-Agent strings.  FETCH-TITLE
;; picks one at random per request; WEBGET falls back to the first one
;; when the caller supplies none.
(defparameter *user-agents* '("Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"
                              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
                              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
                              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15"
                              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"))
;; XML namespace URI for Atom feeds; bound to the "atom" prefix in
;; XML-FEED-TITLE's XPath query.
(defparameter *atom-xml-namespace-url* "http://www.w3.org/2005/Atom")
(defun xml-feed-title (feed)
  "Fetch the Atom feed at the URL FEED and return its title as a string.
Evaluates //atom:title against the parsed document using the Atom
namespace bound in *ATOM-XML-NAMESPACE-URL*."
  ;; Previously the result was wrapped in a debug PRINT and a second,
  ;; redundant XPATH:STRING-VALUE call (string-value of a string is the
  ;; same string).  Both removed; the returned title is unchanged.
  (let* ((body (dexador:get feed))
         (document (cxml:parse body (stp:make-builder))))
    (xpath:with-namespaces (("atom" *atom-xml-namespace-url*))
      (xpath:string-value (xpath:evaluate "//atom:title" document)))))
(defun bad-suffix (url &key (suffixes *binary-url-suffixes*))
  "Return T when URL ends with any suffix in SUFFIXES (by default
*BINARY-URL-SUFFIXES*), otherwise NIL.  Used by FETCH-TITLE to skip
URLs that point at binary content with no scrapable title."
  ;; The old implementation threaded a RETURN-FROM through MAPC inside a
  ;; body-less WHEN; SOME expresses the same search directly.  The WHEN
  ;; wrapper normalizes SOME's generalized-boolean result to exactly T,
  ;; matching the old (return-from bad-suffix t) / trailing NIL behavior.
  (when (some #'(lambda (suffix) (cl-strings:ends-with url suffix))
              suffixes)
    t))
(defun fetch-title (url)
  "Extract the title from a Web page. Return three values.
If the first value is nil, then no actual title could be found. If non-nil,
then this value will be the title scraped from the page.
The second value is identical to the first if a title was found. If not, then
the second value is an informative message to be posted in IRC.
The third value is the redirected URI for the page retrieved, if the page
was retrieved. If the page couldn't be retrieved, the third value is nil.
This is a very confusing API."
  (let ((page-exists-p nil)
        (store-redirect-uri nil)
        (user-agent (random-elt *user-agents*)))
    ;; Don't even try URLs that name binary content.
    (unless (bad-suffix url)
      (handler-case
          (multiple-value-bind (webtext status nonsense redirect-uri)
              (webget url :redirect 10 :user-agent user-agent)
            (declare (ignore nonsense))
            (when (and webtext status (< status 400))
              (setf page-exists-p t
                    store-redirect-uri redirect-uri)
              (if (stringp webtext)
                  ;; Text body: parse it as LHTML and scrape the title out.
                  (let* ((document (chtml:parse webtext (chtml:make-lhtml-builder)))
                         (title (cleanup-title (find-title document))))
                    (when title
                      (return-from fetch-title (values title title redirect-uri))))
                  ;; Non-string body (e.g. binary payload): no title possible.
                  (return-from fetch-title (values nil "Unknown data" redirect-uri)))))
        (sb-int:invalid-array-index-error (c)
          (log:error "Caught error: ~A" c)
          ;; BUG FIX: this handler used to evaluate (values "...") as the
          ;; value of the HANDLER-CASE form, but that form's value is
          ;; discarded (it sits inside UNLESS, before the final VALUES
          ;; below), so the message never reached the caller -- and it
          ;; was in the title position, contradicting the contract above.
          ;; Return explicitly, with the message as the SECOND value.
          (return-from fetch-title
            (values nil "Remote contains invalid language encoding..." nil)))))
    ;; Fell through: page fetched but titleless, or not fetched at all.
    (values nil (if page-exists-p "No title found" nil) store-redirect-uri)))
;;; alternative scraping system which uses an STP document structure
;;; to do a recursive search on the target document for various
;;; entities and values.
(defun webget (url &rest args)
  "take an URL and get the html returned by the webserver at that
resource. Return this html to the caller as a string. In the event
of a network error from drakma, the return of this function is
NIL. Should not throw to the debugger."
  ;; ARGS is a keyword plist forwarded to drakma:http-request, so MEMBER
  ;; is enough to see whether the caller supplied a :user-agent.
  ;; Default to the first canned agent if not.
  (unless (member :user-agent args)
    (setf args (append args (list :user-agent (car *user-agents*)))))
  (multiple-value-bind
        (page status headers uri stream winky nod)
      ;; Cap the whole request at 15 seconds.  IGNORE-ERRORS swallows
      ;; network and decoding errors so a failure yields NILs rather
      ;; than a trip to the debugger.
      (handler-case
          (trivial-timeout:with-timeout (15)
            (ignore-errors
             ;; Substitute #\? for octets that can't be decoded instead
             ;; of signalling.
             (let ((flexi-streams:*substitution-char* #\?))
               (apply #'drakma:http-request url
                      args))))
        (trivial-timeout:timeout-error (c) (declare (ignore c)) nil))
    (declare (ignorable page status headers uri stream winky nod))
    ;; NOTE(review): WINKY and NOD are presumably drakma's trailing
    ;; return values (must-close flag and reason phrase) -- TODO confirm.
    ;; STREAM is deliberately left out of the all-present check: only
    ;; when everything else came back non-NIL do we pass the values on.
    (if (every #'identity (list page status headers uri winky nod))
        (values page status headers uri winky nod)
        (values nil nil))))
(defun clean-html (string)
  "take a shot at repairing broken html. This won't work in the
presence of semantically undecidable breakage, but it does work for
the most common types of human fuck-up."
  ;; Round-trip through closure-html: parse leniently, re-serialize cleanly.
  (let ((sink (chtml:make-string-sink)))
    (chtml:parse string sink)))
(defun parse-page (page)
  "assumes page refers to a string of raw html. Returns a cxml-stp
document structure."
  ;; Repair the markup first so the STP builder gets well-formed input.
  (let ((builder (cxml-stp:make-builder)))
    (chtml:parse (clean-html page) builder)))
(defun stp->string (stpdoc)
  "Serialize the cxml-stp document STPDOC back into a string of markup."
  (let ((sink (cxml:make-string-sink)))
    (stp:serialize stpdoc sink)))
(defun dump-page-string (page file)
  "Write the string PAGE to FILE, superseding any existing contents."
  (with-open-file (out file :direction :output :if-exists :supersede)
    (princ page out)))
;;; /webtils
(defun doomsday-anchor (tree)
  "predicate to find what we're looking for in the sexp representing a
return from the bulletin of the atomic scientists, who are
apparently quite protective of their public bulletins, assuming they
aren't actively fucking with us."
  ;; Match a :SPAN whose first attribute value is \"fl-heading-text\";
  ;; return the whole node on a hit, NIL otherwise.
  (if (and (eq (first tree) :span)
           (string-equal "fl-heading-text"
                         (second (first (second tree)))))
      tree
      nil))
(defun doomsday-extractor (tree)
  "Extract the result from a Doomsday lookup: the final child of the
node that DOOMSDAY-ANCHOR matched."
  (car (last (doomsday-anchor tree))))
(defun find-doomsday (tree)
  "Find how many minutes to midnight according to the Bulletin of the Atomic Scientists."
  ;; Standard anchor/extractor pair fed to the generic tree search.
  (extract-from-html tree #'doomsday-anchor #'doomsday-extractor))
(defun twitter-twit (url)
  "Convenience function to extract the Twitter user name from an URL."
  (multiple-value-bind (match groups)
      (scan-to-strings "twitter.com/(#!/)?([^/]+)/status/([0-9]+)" url)
    ;; Register 1 holds the user name (register 0 is the optional #!/).
    (when match
      (elt groups 1))))
(defun twitter-payload (url)
  "Fetch the text from a Twitter posting."
  ;; NOTE(review): SCAN-TO-SUBSTRINGS is not a standard cl-ppcre export;
  ;; presumably a project helper returning the register groups as a
  ;; sequence (cf. TWITTER-TWIT above, which uses SCAN-TO-STRINGS) --
  ;; confirm it exists and returns (junk user-name status-id ...).
  (let ((parts (scan-to-substrings "twitter.com/(#!/)?([^/]+)/status/([0-9]+)" url)))
    (if parts
        (let* ((twit (elt parts 1))            ; user name register
               (twit-id-string (elt parts 2))  ; numeric status id register
               ;; NOTE(review): search.twitter.com's unauthenticated v1
               ;; JSON search API was retired by Twitter long ago; this
               ;; request almost certainly fails today.
               (twitter-spooge (with-open-stream
                                   (s (drakma:http-request
                                       (format nil "http://search.twitter.com/search.json?q=from:~A&max_id=~A" twit twit-id-string)
                                       :want-stream t))
                                 (json:decode-json-from-source s)))) 
          ;; From the :RESULTS list, keep the entry whose :ID--STR matches
          ;; the requested status id, and return its :TEXT field.
          (cdr (assoc :text (car (remove-if-not #'(lambda (x) (string= (cdr (assoc :id--str x)) twit-id-string))
                                                (cdr (assoc :results twitter-spooge)))))))
        ;; No match: second value carries a diagnostic for the caller.
        (values nil (format nil "no extracted parts from url: ~A" url)))))
(defun metar-anchor (tree)
  "Predicate matching a :FONT element in an LHTML tree."
  (eql (first tree) :font))
(defun metar-extractor (tree)
  "Return the first child (third list element) of a matched :FONT node."
  (nth 2 tree))
(defun find-metar (text)
  "Parse TEXT as LHTML and return the contents of the first :FONT element."
  (let ((document (chtml:parse text (chtml:make-lhtml-builder))))
    (extract-from-html document 'metar-anchor 'metar-extractor)))
(defun papal-extractor (tree)
  "Dig the payload out of a node matched by PAPAL-ANCHOR: the second
element three levels down inside the node's fourth element."
  (let* ((level1 (fourth tree))
         (level2 (second level1))
         (level3 (second level2)))
    (second level3)))
(defun papal-anchor (tree)
  "Predicate matching a :MAP element whose first attribute value is
\"piechart\"."
  ;; (cdaadr tree) == (cdr (car (car (cdr tree)))): the tail of the
  ;; first attribute pair, whose first element is the attribute value.
  (and (equal :map (car tree))
       (string-equal "piechart"
                     (first (cdr (car (car (cdr tree))))))))
;; drakma is very thorough in checking the correctness of the HTML
;; it fetches. Unfortunately, it wants to see a newline character
;; at the end-of-file. The Hacker News website doesn't provide one.
;; *old-bogus-eols* stores the original value of
;; chunga:*accept-bogus-eols* so we can set it to t to suppress
;; this stringent checking.
(defvar *old-bogus-eols* nil)
;; Saved value of drakma:*drakma-default-external-format*, captured by
;; START-WEB-CLIENT and restored by STOP-WEB-CLIENT.
(defvar *old-external-format* nil)
(defun start-web-client ()
  "Initialize the system to request Web pages.
Saves the current chunga and drakma settings so STOP-WEB-CLIENT can
restore them, then relaxes EOL checking and forces UTF-8."
  ;; SETF evaluates its pairs left to right, so each old value is
  ;; captured before the corresponding special is overwritten.
  (setf *old-bogus-eols* chunga:*accept-bogus-eols*
        chunga:*accept-bogus-eols* t
        *old-external-format* drakma:*drakma-default-external-format*
        drakma:*drakma-default-external-format* :utf-8))
(defun stop-web-client ()
  "Clean up after requesting Web pages.
Restores the chunga and drakma settings saved by START-WEB-CLIENT."
  (setf chunga:*accept-bogus-eols* *old-bogus-eols*)
  (setf *old-bogus-eols* nil)
  (setf drakma:*drakma-default-external-format* *old-external-format*)
  ;; CONSISTENCY FIX: the saved external format was never cleared,
  ;; unlike *old-bogus-eols* above.  Clear it so both saved slots are
  ;; NIL whenever the client is stopped.
  (setf *old-external-format* nil))