This repository has been archived by the owner on Apr 29, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 14
/
whoisThread.py
727 lines (634 loc) · 26.5 KB
/
whoisThread.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
# -*- coding: utf-8 -*-
import threading
import proxywhois
import socks
import sys
import time
import traceback
import re
import urlparse
import config
import string
import random
import datetime
# Exception for a whois query that produced no usable data
class NullWhoisException(Exception):
    """Signals an empty (or whitespace-only) whois result.

    value      -- description of what came back empty
    whitespace -- True when the response consisted only of whitespace
    count      -- class-wide tally of how many instances were created
    """
    count = 0

    def __init__(self, value, whitespace=False):
        self.whitespace = whitespace
        self.value = value
        # every construction is tallied, whether or not it is ever raised
        NullWhoisException.count += 1

    def __str__(self):
        return "Null Whois: %r" % (self.value,)
# Deprecated: the exceptions inside proxywhois are used instead
class WhoisTimeoutException(Exception):
    """Signals that a whois query timed out.

    value -- what the timeout occurred on
    count -- class-wide tally of how many instances were created
    """
    count = 0

    def __init__(self, value):
        self.value = value
        WhoisTimeoutException.count += 1

    def __str__(self):
        return "Whois Timeout on: %r" % (self.value,)
class WhoisLinesException(Exception):
    """Signals a whois response with too few lines to be considered valid.

    value -- description of the short response
    data  -- the raw response text that was received
    count -- class-wide tally of how many instances were created
    """
    count = 0

    def __init__(self, value, data):
        self.value, self.data = value, data
        WhoisLinesException.count += 1

    def __str__(self):
        return "Response Too Small: %r\n%r" % (self.value, self.data)
class WhoisRatelimitException(Exception):
    """Signals that a whois server's rate limit applies to a query.

    server   -- the whois server concerned
    hard     -- True for a hard limit; only hard limits are tallied
    forceInc -- flag stored for the handler to consult
    count    -- class-wide tally of hard-limit instances created
    """
    count = 0

    def __init__(self, server, hard_limit=True, forceInc=False):
        self.server = server
        self.hard = hard_limit
        self.forceInc = forceInc
        if hard_limit:
            WhoisRatelimitException.count += 1

    def strict(self):
        """Return True when the server is listed in config.STRICT_SERVERS."""
        return self.server in config.STRICT_SERVERS

    def __str__(self):
        return "Whois Ratelimit Reached on: %r Hard Limit: %s" % (self.server, self.hard)
class WhoisBadDomainException(Exception):
    """Signals that a domain is invalid or unknown to the whois server.

    domain -- the offending domain name
    count  -- class-wide tally of how many instances were created
    """
    count = 0

    def __init__(self, domain):
        self.domain = domain
        WhoisBadDomainException.count += 1

    def __str__(self):
        return "Invalid Domain: %r" % (self.domain,)
#TODO currently unused
class WhoisHTTPReferralException(Exception):
    '''Whois result refers us to a web address, possibly due to rate limiting.

    domain -- the domain that was queried
    server -- the whois server that answered
    url    -- the web address we were referred to
    count  -- class-wide tally of how many instances were created
    '''
    count = 0

    def __init__(self, domain, server, url):
        self.domain, self.server, self.url = domain, server, url
        WhoisHTTPReferralException.count += 1

    def __str__(self):
        return "HTTP Refferal domain: %r server: %r url: %r" % (
            self.domain, self.server, self.url)
def printExceptionCounts():
    """Print, one per line, how many of each whois exception were created."""
    tallies = (
        ("WhoisRatelimitException:\t", WhoisRatelimitException),
        ("NullWhoisException:\t", NullWhoisException),
        ("WhoisTimeoutException:\t", WhoisTimeoutException),
        ("WhoisBadDomainException:\t", WhoisBadDomainException),
        ("WhoisHTTPReferralException:\t", WhoisHTTPReferralException),
        ("WhoisLinesException:\t", WhoisLinesException),
    )
    for label, exc_class in tallies:
        print(label + str(exc_class.count))
# Module-wide shared state; every value is guarded by the lock named after it.

# number of threads currently counted as active
numActiveThreads = 0
numActiveThreads_lock = threading.Lock()

# number of threads currently counted as running on a proxy
numProxyThreads = 0
numProxyThreads_lock = threading.Lock()

# remote proxy IPs registered through addRemoteProxyIP
proxy_ip_list = []
proxy_ip_list_lock = threading.Lock()

# running total maintained by incrementLookupCount
numLookups = 0
numLookups_lock = threading.Lock()
def removeRemoteProxyIP(ip):
    """Drop ip from the shared proxy IP list, holding the list lock.

    Prints a notice (and changes nothing) when ip is not in the list.
    """
    with proxy_ip_list_lock:
        if ip not in proxy_ip_list:
            print("Cant remove IP from list it is not in " + str(ip))
            return
        proxy_ip_list.remove(ip)
def addRemoteProxyIP(ip):
    """Register ip in the shared proxy IP list, holding the list lock.

    Returns True when ip was newly added, False when it was already listed.
    """
    with proxy_ip_list_lock:
        already_listed = ip in proxy_ip_list
        if not already_listed:
            proxy_ip_list.append(ip)
    return not already_listed
def incrementLookupCount():
    """Add one to the shared lookup counter while holding its lock."""
    global numLookups
    with numLookups_lock:
        numLookups += 1
def getLookupCount():
    """Return the current value of the shared lookup counter.

    The read is done while holding numLookups_lock so it is consistent
    with concurrent incrementLookupCount() calls.

    The previous version declared `global numActiveThreads_loc` (a name
    that does not exist) and `global numActiveThreads`, neither of which
    this function uses; reading module globals needs no declaration, so
    the misleading statements are gone.
    """
    with numLookups_lock:
        return numLookups
'''
Active threads are threads that are not sleeping and are actively querying a record
'''
def incrementActiveThreadCount():
    """Add one to the shared active-thread counter while holding its lock."""
    global numActiveThreads
    with numActiveThreads_lock:
        numActiveThreads += 1
def decrementActiveThreadCount():
    """Subtract one from the shared active-thread counter while holding its lock."""
    global numActiveThreads
    with numActiveThreads_lock:
        numActiveThreads -= 1
def getActiveThreadCount():
    """Return the shared active-thread counter, read while holding its lock."""
    with numActiveThreads_lock:
        return numActiveThreads
'''
Proxy threads are threads with working proxies
'''
def incrementProxyThreadCount():
    """Add one to the shared proxy-thread counter while holding its lock."""
    global numProxyThreads
    with numProxyThreads_lock:
        numProxyThreads += 1
def decrementProxyThreadCount():
    """Subtract one from the shared proxy-thread counter while holding its lock."""
    global numProxyThreads
    with numProxyThreads_lock:
        numProxyThreads -= 1
def getProxyThreadCount():
    """Return the shared proxy-thread counter, read while holding its lock."""
    with numProxyThreads_lock:
        return numProxyThreads
#this object is used to store the results of a whois result as it is passed around
class WhoisResult:
    """Everything known about the whois lookup of one domain.

    domain            -- the domain being looked up
    attempts          -- attempt objects, oldest first
    current_attempt   -- the most recently added attempt (None until one is added)
    maxAttempts       -- flag, initially False; this class only reports it in the log
    next_whois_server -- server recorded through setNextServer
    fails             -- number of errors recorded with fail=True
    """

    def __init__(self, domain):
        self.domain = domain
        self.attempts = list()
        self.current_attempt = None
        self.maxAttempts = False
        self.next_whois_server = None
        self.fails = 0

    def getNextServer(self):
        """Return the whois server recorded by setNextServer (or None)."""
        return self.next_whois_server

    def setNextServer(self, server):
        """Record the whois server to use on the next query."""
        self.next_whois_server = server

    def valid(self):
        '''Quick check that the thick data may contain something valid.

        Returns True when config.EMAIL_REGEX matches the thick response.
        Returns False when it does not match, or when there is no thick
        response at all (previously that case handed None to re.search
        and raised TypeError).
        '''
        data = self.getThickData()
        if data is None:
            return False
        #search for email
        return re.search(config.EMAIL_REGEX, data) is not None

    def addAttempt(self, attempt):
        """Append attempt, make it the current attempt and return it."""
        self.attempts.append(attempt)
        self.current_attempt = self.attempts[-1]
        return self.current_attempt

    def addError(self, error, fail=True):
        """Record error on the current attempt.

        fail -- when True the failure counter is incremented as well.
        The counter is bumped even if there is no current attempt; in that
        case the error itself is dropped and a message is printed.
        """
        if fail:
            self.fails += 1
        if self.current_attempt:
            self.current_attempt.addError(error)
        else:
            print("ERROR: Adding error to result without attempt")

    def getLogData(self):
        """Return a list of log lines describing this result and its attempts."""
        log = list()
        log.append("DOMAIN: "+self.domain)
        log.append("Fails: "+str(self.fails))
        log.append("Max Attempts: "+ str(self.maxAttempts))
        log.append("Last Whois Server: "+ str(self.next_whois_server))
        for (num, attempt) in enumerate(self.attempts):
            #dont log when one proxy hands off to another without doing any work
            idle = (attempt.success == False
                    and len(attempt.responses) == 0
                    and len(attempt.errors) == 0)
            if not idle:
                log.append("-----------Attempt:"+str(num)+"------------")
                log += attempt.getLogData()
        return log

    def getAllData(self, all_data=True):
        """Return data from the last attempt.

        With all_data True this is that attempt's getResponse(); otherwise
        it is whatever its getLastResponse() returns.

        Known bug: when a failure occurred on a thick server the thin data
        is not saved. Deprecated in favor of getThickData and getThinData.
        Raises IndexError when there are no attempts.
        """
        if all_data:
            return self.attempts[-1].getResponse()
        else:
            return self.attempts[-1].getLastResponse()

    def getThickData(self):
        """Return the newest thick response text over all attempts, or None."""
        for attempt in self.attempts[::-1]:
            r = attempt.getThickResponse()
            if r:
                return r.getResponse()
        return None

    def getThinData(self):
        """Return the newest thin response text over all attempts, or None."""
        for attempt in self.attempts[::-1]:
            r = attempt.getThinResponse()
            if r:
                return r.getResponse()
        return None

    def numFails(self):
        """Return the number of failures recorded so far."""
        return self.fails

    def getLastAttempt(self):
        """Return the most recent attempt, or None when there are none."""
        if len(self.attempts) > 0:
            return self.attempts[-1]
        else:
            return None
#class to hold details on an attempt to whois a particular domain
class WhoisAttempt:
    """A single try at whois-ing a domain through one proxy.

    timestamp -- creation time (float, from time.time())
    success   -- initially False; this class only reports it in the log
    proxy     -- the proxy the attempt runs through
    errors    -- errors recorded during the attempt
    responses -- response objects in the order they were queried
    """

    def __init__(self, proxy):
        self.proxy = proxy
        self.timestamp = time.time()
        self.success = False
        self.errors = []
        self.responses = []

    def addError(self, error):
        """Record an error against this attempt."""
        self.errors.append(error)

    def getLogData(self):
        """Return a list of log lines for this attempt, its responses and errors."""
        lines = [
            "Timestamp: " + str(self.timestamp),
            "Proxy: " + self.proxy.getLog(),
            "Success: " + str(self.success),
            "Responses: " + str(len(self.responses)),
        ]
        for resp in self.responses:
            lines.extend(resp.getLogData())
        lines.append("Errors: " + str(len(self.errors)))
        lines.extend("--Error: " + str(err) for err in self.errors)
        return lines

    def getLastResponse(self):
        """Return the most recent response object, or None when there are none."""
        return self.responses[-1] if self.responses else None

    def getResponse(self):
        """Return every response's text, each followed by a newline, or None."""
        if not self.responses:
            return None
        return "".join(resp.getResponse() + "\n" for resp in self.responses)

    def getThickResponse(self):
        """Return the newest response typed ResultType.Thick, or None."""
        for resp in reversed(self.responses):
            if resp.resultType == ResultType.Thick:
                return resp
        return None

    def getThinResponse(self):
        """Return the newest response typed ResultType.Thin, or None."""
        for resp in reversed(self.responses):
            if resp.resultType == ResultType.Thin:
                return resp
        return None

    def addResponse(self, response):
        """Append a response object to this attempt."""
        self.responses.append(response)
""" Class to represent thick / thin enum types """
class ResultType:
    """Enum-like constants tagging a whois response as thin, thick or unknown."""
    Unknown, Thin, Thick = range(3)
"""Class used to store the response of an individual
whois query, may be a thick or thin result"""
class WhoisResponse:
    """The reply of one whois query against one server.

    server     -- the whois server that was asked
    response   -- the reply stored by setResponse (None until then)
    resultType -- a ResultType constant, Unknown until setType is called
    """

    def __init__(self, server):
        self.server = server
        self.resultType = ResultType.Unknown
        self.response = None

    def setResponse(self, response):
        """Store the reply."""
        self.response = response

    def getResponse(self):
        """Return the stored reply (None when none was stored)."""
        return self.response

    def getServer(self):
        """Return the whois server this response belongs to."""
        return self.server

    def getType(self):
        """Return the ResultType constant of this response."""
        return self.resultType

    def setType(self, t):
        """Tag this response with a ResultType constant."""
        self.resultType = t

    def getLogData(self):
        """Return log lines: the server followed by the framed response text."""
        return [
            "WHOIS server: " + str(self.server),
            "======Response=====================",
            str(self.response),
            "===================================",
        ]
#class to hold a proxy object
class Proxy:
    """A proxy endpoint plus the whois client that queries through it.

    server, port, proxy_type -- how to reach the proxy
    external_ip     -- remote IP seen through the proxy (None until probed)
    ready           -- True once connect() has succeeded
    errors          -- error counter, reported by getLog()
    client          -- proxywhois.NICClient used for the actual queries
    history         -- whois server -> time.time() of the last query sent to it
    nextHistoryTrim -- earliest time at which history is pruned again
    """

    # All markers below are matched against the *lowercased* response, so
    # they must themselves be lowercase.

    # Responses containing any of these mean the server is rate limiting us.
    #TODO parse limit and add to exception
    _RATELIMIT_MARKERS = (
        "limit exceeded",
        "please note that the query limit is",
        "quota exceeded",
        "try again later",
        "limit reached",
        # was "IP addresses that may have failed", which could never match
        # lowercased data
        "ip addresses that may have failed",
    )

    # Responses containing any of these mean the domain is bad/non-existent.
    _BAD_DOMAIN_MARKERS = (
        "invalid domain name",
        "no match",
        " is not registered here.",
        "not found",
        "can't get information on local domain",
        "no information available",
        "no matching record",
        "invalid query",
        "out of this registry",
        "out of registry",
        "domain name invalid format",
        "no data found",
        "incorrect domain name",
        "no domain",
        # NOTE(review): the original comment on this entry read "yes, that
        # is 2 spaces" but the literal has a single space -- confirm which
        # one the server actually sends.
        "no found",
        "whois service not available for this domain",
        "we do not have an entry in our database matching your query",
        "syntax error in specified domain name",
        "not exists",
        "we're sorry, there has been a problem. technicians have been notified",
        # NOTE(review): unicode literal tested against the raw response;
        # kept last, exactly where the original tested it.
        u'網域名稱不合規定',
    )

    # Responses containing any of these refer us to a web address; the
    # response is handed back as-is.
    #TODO WGET http url
    #TODO some of these http errors may never acually be seen due to the linux whois client being hardcoded
    _HTTP_REFERRAL_MARKERS = (
        "this tld has no whois server, but you can access the whois database at",
        'registered\nnot the default registrar',
    )

    def __init__(self, ip, port, proxy_type):
        self.server = ip
        self.port = port
        self.proxy_type = proxy_type
        self.external_ip = None
        self.ready = False
        self.errors = 0
        self.client = proxywhois.NICClient()
        self.history = dict()
        self.nextHistoryTrim = time.time()

    def connect(self):
        """Probe the proxy and point the whois client at it.

        Returns True (and sets ready) only when this call's probe obtained
        an external IP. The previous version tested the stored external_ip,
        so an IP left over from an earlier successful probe made a dead
        proxy look connected.
        """
        probed_ip = self.updateExternalIP()
        self.client.set_proxy(self.proxy_type, self.server, self.port)
        self.ready = bool(probed_ip)
        return self.ready

    def getLog(self):
        """Return the proxy description followed by its error count."""
        return str(self) + " Errors: " + str(self.errors)

    def __repr__(self):
        ret = "Server:" + self.server + ":" + str(self.port)
        if self.external_ip:
            ret += " ExtIP:" + self.external_ip
        return ret

    def updateExternalIP(self):
        """Use the proxy socket to get the remote IP on that proxy.

        Tries up to three times, 0.1s apart. On success stores the last
        whitespace-separated token of the reply in external_ip and returns
        it; returns None when every try fails (external_ip is then left
        untouched).
        """
        host = "http://www.sysnet.ucsd.edu/cgi-bin/whoami.sh"
        url = urlparse.urlparse(host)
        for _ in range(3): #try 3 times
            s = None
            r = None
            try:
                s = socks.socksocket(socks.socket.AF_INET, socks.socket.SOCK_STREAM)
                s.settimeout(config.WHOIS_TIMEOUT_SECONDS)
                s.setproxy(self.proxy_type, self.server, self.port)
                s.connect((url.hostname, 80))
                s.send('GET '+url.path+' HTTP/1.0\r\nHost: '+url.hostname+'\r\n\r\n')
                r = s.recv(4096)
            except Exception:
                # any failure just means this try did not work; retry below
                r = None
            finally:
                # the probe socket used to be leaked on every try
                if s is not None:
                    try:
                        s.close()
                    except Exception:
                        # best-effort cleanup; a failed close must not mask
                        # the probe result
                        pass
            if r:
                tokens = r.split()
                # a whitespace-only reply has no tokens; treat it as a miss
                if tokens:
                    self.external_ip = tokens[-1]
                    return self.external_ip
            time.sleep(0.1)
        return None

    def trimHistory(self, t):
        """Drop history entries older than the server jump delay.

        t -- current time (seconds). Nothing happens until t passes
        nextHistoryTrim, which is then pushed
        config.WHOIS_HISTORY_TRIM_MINUTES into the future.
        """
        if t <= self.nextHistoryTrim:
            return
        self.nextHistoryTrim = t + datetime.timedelta(minutes=config.WHOIS_HISTORY_TRIM_MINUTES).total_seconds()
        trimAge = t - config.WHOIS_SERVER_JUMP_DELAY
        # iterate over a snapshot: entries are deleted while looping
        for server, lastSeen in list(self.history.items()):
            if lastSeen < trimAge:
                del self.history[server]

    def _checkShortResponse(self, record, whois_server, data, nLines):
        """Classify a response that has fewer lines than the configured minimum.

        Returns True when the response is an HTTP referral and should be
        handed back to the caller immediately, False when it is a known
        harmless short answer and processing should continue. Raises
        NullWhoisException, WhoisRatelimitException,
        WhoisBadDomainException or WhoisLinesException otherwise.
        """
        data_lower = data.lower()
        if len(data_lower.strip()) == 0:
            raise NullWhoisException("whitespace response", True)
        if any(marker in data_lower for marker in self._RATELIMIT_MARKERS):
            raise WhoisRatelimitException(whois_server)
        if any(marker in data_lower for marker in self._BAD_DOMAIN_MARKERS):
            raise WhoisBadDomainException(record.domain)
        if any(marker in data_lower for marker in self._HTTP_REFERRAL_MARKERS):
            #url = data.splitlines()[-1])
            #raise WhoisHTTPReferralException(record.domain, whois_server, url)
            return True
        #corner cases: short answers from these servers are acceptable
        if whois_server == "to.whois-servers.net" and "tonic whoisd" in data_lower:
            return False
        if whois_server == "it.whois-servers.net" and "unassignable" in data_lower:
            return False
        error = "Error: recieved small "+str(nLines)+" response for domain: "+record.domain+" on server: "+whois_server+" Using proxy: "+self.server
        raise WhoisLinesException(error, data)

    def whois(self, record):
        """Replacement for whois_lookup from the proxywhois class.

        Queries the record's next whois server (or the one chosen by the
        client), following at most one referral, and appends each response
        to the record's last attempt.

        Returns False when the proxy is not ready, otherwise the last
        WhoisResponse obtained.

        Raises WhoisRatelimitException when the server was queried too
        recently to wait it out, when the client reports server trouble, or
        when the reply looks like a rate-limit notice; NullWhoisException
        for empty replies or when no server is available;
        WhoisBadDomainException / WhoisLinesException for short replies
        classified as such.
        """
        if not self.ready:
            return False
        # this is the maximum amount of times we will recurse looking for
        # a thin whois server to refer us
        recurse_level = 2
        whois_server = record.getNextServer()
        if whois_server is None:
            # find initial whois server
            whois_server = self.client.choose_server(record.domain)
        response = None
        while recurse_level > 0 and whois_server is not None:
            whois_server = whois_server.lower()
            record.setNextServer(whois_server)
            t = time.time()
            if whois_server in self.history:
                tdelta = t - self.history[whois_server]
                if tdelta < config.WHOIS_SERVER_JUMP_DELAY: #queried this server too recently
                    remaining = config.WHOIS_SERVER_JUMP_DELAY - tdelta
                    if remaining < config.WHOIS_SERVER_SLEEP_DELAY: #short enough to wait out
                        # not counted as active while sleeping
                        decrementActiveThreadCount()
                        time.sleep(remaining)
                        incrementActiveThreadCount()
                        # record when the query is really sent, not when
                        # we started waiting
                        t = time.time()
                    else:
                        time.sleep(random.random()) #this protects us from busy waiting
                        raise WhoisRatelimitException(whois_server, False)
            self.history[whois_server] = t
            #TODO have thread remove old entries from history every x runs (runs % x)
            # currently using time
            self.trimHistory(t)
            response = WhoisResponse(whois_server)
            incrementLookupCount()
            try:
                data = self.client.whois(record.domain, whois_server, 0)
            except proxywhois.ServerTroubleException:
                raise WhoisRatelimitException(whois_server, False, True)
            if data is None or len(data) < 1:
                error = "Error: Empty response recieved for domain: "+record.domain+" on server: "+whois_server+" Using proxy: "+self.server
                if config.DEBUG:
                    print(error)
                #TODO this may often be a WhoisRatelimitException case
                raise NullWhoisException(error)
            response.setResponse(data)
            record.getLastAttempt().addResponse(response)
            nLines = data.count('\n')
            if nLines < config.MIN_RESPONSE_LINES: #too short to be a valid response as-is
                if self._checkShortResponse(record, whois_server, data, nLines):
                    return response
            recurse_level -= 1
            if recurse_level == 0:
                response.setType(ResultType.Thick)
            else:
                whois_server = self.client.findwhois_server(response.getResponse(), whois_server) # get next whois server if exists
                if whois_server is None:
                    # no referral: this response is the thick one
                    response.setType(ResultType.Thick)
                else:
                    # referred onwards: this response is thin
                    response.setType(ResultType.Thin)
        if response is None:
            # no server was ever selected; previously this fell through to
            # an UnboundLocalError on the return below
            raise NullWhoisException("Error: No whois server available for domain: "+record.domain)
        return response #returns the last response used
#main thread which handles all whois lookups, one per proxy
class WhoisThread(threading.Thread):
    """Daemon worker that pulls records off a queue and whois-es them
    through its own proxy.

    queue      -- source of records to look up (finished or exhausted
                  records are not put back)
    proxy      -- the proxy object this worker uses
    save_queue -- destination for records that succeeded or gave up
    running    -- True while the inner work loop should keep going
    """

    def __init__(self, proxy, queue, save):
        threading.Thread.__init__(self)
        self.daemon = True
        self.queue = queue
        self.proxy = proxy
        self.save_queue = save
        self.running = False

    def fail(self, record, error, requeue=True, failIncrement=True):
        """Record a failed lookup and decide what happens to the record.

        error         -- message stored on the record's current attempt
        requeue       -- when True the record goes back on the work queue
                         unless it has reached config.MAX_ATTEMPTS failures
        failIncrement -- whether this failure counts towards the record's
                         failure total
        Records that are not requeued are flagged maxAttempts and handed to
        the save queue.
        """
        if failIncrement or config.DEBUG:
            self.proxy.errors += 1
        record.addError(error, failIncrement)
        if config.DEBUG:
            print("["+ str(self.proxy) +"] "+ str(error))
        if requeue and record.numFails() < config.MAX_ATTEMPTS:
            self.queue.put(record)
        else:
            record.maxAttempts = True
            # not counted as active while handing off to the save queue
            decrementActiveThreadCount()
            self.save_queue.put(record)
            incrementActiveThreadCount()

    def run(self):
        """Connect the proxy, then process records until the proxy dies;
        reconnect and repeat forever."""
        # distribute proxys starting up
        time.sleep(random.randrange(0, 5))
        while True:
            #get and print my remote IP, also tests the proxy for usability
            #wait until proxy is active if down
            while not self.proxy.connect():
                if config.DEBUG:
                    print("WARNING: Failed to connect to proxy: " + str(self.proxy))
                time.sleep(config.PROXY_FAIL_RECONNECT_DELAY)
            if not addRemoteProxyIP(self.proxy.external_ip):
                if config.DEBUG:
                    print("WARNING: Proxy is already being used ["+self.proxy.server+"] on port: "+str(self.proxy.port)+" with remote IP: "+self.proxy.external_ip)
                # dont return, instead keep waiting just in case
                time.sleep(config.PROXY_FAIL_RECONNECT_DELAY*3)
                continue
            self.running = True
            incrementProxyThreadCount()
            while self.running:
                #get next host
                record = self.queue.get()
                incrementActiveThreadCount()
                record.addAttempt(WhoisAttempt(self.proxy))
                try:
                    if config.DEBUG:
                        print(str(self.proxy) +" trying to whois: "+record.domain)
                    self.proxy.whois(record)
                    if config.DEBUG:
                        print(str(self.proxy) +" whois return on: "+record.domain)
                except proxywhois.WhoisNoServerException as e:
                    #the domain does not have a valid known whois server, may be an http server
                    #nothing we can do, skip domain
                    self.fail(record, str(e), False)
                except WhoisRatelimitException as e:
                    if e.hard:
                        #TODO dynamically change whois server allowed rate
                        self.fail(record, str(e), True, (config.LAZY_MODE or e.forceInc))
                    else:
                        #we reached a server who's wait is more than the allowed sleeping time
                        #give the request to another server
                        self.queue.put(record)
                except proxywhois.socks.GeneralProxyError as e:
                    if e.value[0] == 6: #is there a proxy error?
                        error = "Unable to connect to once valid proxy"
                        if config.DEBUG:
                            print(error)
                        record.addError(error)
                        self.queue.put(record)
                        # leave the work loop so the outer loop reconnects
                        self.running = False
                        # make sure to remove us from the active IP list
                        removeRemoteProxyIP(self.proxy.external_ip)
                    else:
                        error = "Error Running whois on domain:["+record.domain+"] " + str(e)
                        self.fail(record,error)
                except (proxywhois.socks.HTTPError, proxywhois.socks.Socks4Error, proxywhois.socks.Socks5Error) as e:
                    #bad domain name
                    error = "Invalid domain: " + record.domain
                    self.fail(record,error)
                except (NullWhoisException, WhoisTimeoutException, WhoisLinesException) as e:
                    self.fail(record, str(e))
                except WhoisBadDomainException as e:
                    self.fail(record, str(e), False)
                except Exception as e:
                    # catch-all: this clause used to repeat
                    # WhoisBadDomainException and so was unreachable, letting
                    # any unexpected error kill the worker and lose the record
                    error = "FAILED: [" + record.domain + "] error: " + str(sys.exc_info()[0])
                    self.fail(record,error)
                else:
                    if (not config.RESULT_VALIDCHECK) or record.valid():
                        record.current_attempt.success = True
                        # not counted as active while handing off to the save queue
                        decrementActiveThreadCount()
                        self.save_queue.put(record)
                        incrementActiveThreadCount()
                    else:
                        error = "INVALID RESULT: [" + record.domain + "] Failed validity check"
                        self.fail(record,error)
                finally:
                    #inform the queue we are done
                    self.queue.task_done()
                    decrementActiveThreadCount()
            decrementProxyThreadCount()