Improving security analysis

rchatterjee · Nov 23, 2016 · f0fb62c · f0fb62c
1 parent 8df7b52
commit f0fb62c
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 52 deletions.
diff --git a/security/compute_guesses_numpy.py b/security/compute_guesses_numpy.py
@@ -48,10 +48,10 @@ def set_globals(settings_i):
     # MIN_ENT, REL_ENT, MAX_NH_SIZE, CACHE_SIZE,
     global N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, MAX_NH_SIZE, CACHE_SIZE, Q
     settings = [
-        (1e4, 10, -3, 10, 5, 1000), # online w/ blacklist 
-        (1e4,  0,  0, 10, 5, 1000), # online w/o blacklist 
-        (1e5, 10, -3, 10, 5, 10000), # offline w/ blacklist 
-        (1e5,  0,  0, 10, 5, 10000), # offline w/o blacklist 
+        (1e4, 10, -3, 10, 5, 1000), # online w/ blacklist
+        (1e4,  0,  0, 10, 5, 1000), # online w/o blacklist
+        (1e5, 10, -3, 10, 5, 10000), # offline w/ blacklist
+        (1e5,  0,  0, 10, 5, 10000), # offline w/o blacklist
     ]
     (N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, MAX_NH_SIZE, CACHE_SIZE, Q) = settings[settings_i]
     return settings[settings_i]
@@ -248,7 +248,7 @@ def create_pw_nh_graph(fname):
     #     split *= multiplier
 
 
-def read_pw_nh_graph(fname, q=-1):
+def read_pw_nh_graph(fname, q=-1, _N=-1):
     """Reads the typo trie file and the neighborhood map created by
     `create_pw_nh_graph` function.
 
@@ -270,6 +270,8 @@ def read_pw_nh_graph(fname, q=-1):
     """
     # N = 1000
     global N
+    if _N>0:
+        N = _N
     typodir = '{}/typodir'.format(pwd)
     pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
     N = min(N, len(pwm))
@@ -458,7 +460,7 @@ def compute_guesses_using_typodist(fname, q, nh_size=5, topk=False, offline=Fals
         proc_name = "TOPKTypo-{}-{}-{}".format
     else:
         proc_name = "TYPODIST-{}-{}-{}".format
-    proc_name = proc_name(MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, 
+    proc_name = proc_name(MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF,
                           ('off' if offline else 'on'))
 
     pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
@@ -565,7 +567,12 @@ def get_trie_key(T, _id):
         return T.restore_key(_id)
     except KeyError:
         return ''
-
+def get_trie_id(T, key):
+    try:
+        return T.key_id(unicode(key))
+    except KeyError:
+        return -1
+
 proc_name = 'ALL'
 def compute_guesses_all(fname, q):
     """We computed neighborhood graph, considering the neighborhood graph
@@ -681,7 +688,7 @@ def run_all(offline=False):
     for p in processes: p.start()
     # for p in processes: p.join()
     return
-        
+
 if __name__ == '__main__':
     import sys
     # create_pw_db_(sys.argv[1])
@@ -711,7 +718,7 @@ def run_all(offline=False):
     # }
     # process['p_typodist'].start()
     # process['p_topk'].start()
-    
+
     # compute_guesses_using_typodist(fname, q, 5, True, offline=True)
     # compute_guesses_using_typodist(fname, q, 10, False)
     # process['p_typodist'].join()

diff --git a/security/compute_secloss.py b/security/compute_secloss.py
@@ -10,59 +10,92 @@
 from word2keypress import Keyboard
 from word2keypress.weighted_edist import sample_typos, get_topk_typos
 from zxcvbn import password_strength
-from compute_guesses_numpy import get_topk_typos, get_typodist_nh
+from compute_guesses_numpy import (
+    get_topk_typos, get_typodist_nh, read_pw_nh_graph, get_trie_key, get_trie_id,
+    N
+)
 
 KB = Keyboard()
 NH_SIZE = 10
 def compute_secloss(guess_file, attpwf, chlpwf, q=100):
-    chlpwm = Passwords(chlpwf)
-    attpwm = Passwords(attpwf)
+    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)
+    attpwm = Passwords(attpwf, max_pass_len=25, min_pass_len=5)
     guesses = [w for w, _ in json.load(open(guess_file))]
     guess_set = set(guesses)
     q = len(guesses)
     print("Found {} guesses".format(q))
-    lambda_q = sum(chlpwm.pw2freq(pw) for _id, pw, f in attpwm.iterpws())/chlpwm.totalf()
+    lambda_q = sum(chlpwm.pw2freq(pw) for _id, pw, f
+                   in attpwm.iterpws(q))/float(chlpwm.totalf())
     print("Normal succces: {}".format(lambda_q))
     union_ball = set([
         rpw
         for w in guesses
         for rpw in KB.word_to_typos(str(w))
         if chlpwm.pw2id(rpw)>=0
-    ])
+    ]) | guess_set
+
     print("Worst case success rate = {}"\
-          .format(sum(chpwm.pw2freq(w) for w in union_ball)/chpwm.totalf()))
+          .format(sum(chlpwm.pw2freq(w) for w in union_ball)/float(chlpwm.totalf())))
+
+    # global N
+    # N = 10000
+    # M, A, typo_trie, _ = read_pw_nh_graph(chlpwf, N)
+    # Mprime = np.zeros((M.shape[0], NH_SIZE+1))
+    # B = [[] for _ in guesses]
+    # # for g in xrange(M.shape[0]):
+    # M = Mprime
+    # fuzzlambda_q = 0.0
+    # guess_key_ids = [get_trie_id(typo_trie, g) for g in guess_set]
+    # killed = []
+
+    # for rpw in union_ball:
+    #     try:
+    #         rpwid = typo_trie.key_id(unicode(rpw))
+    #         for g in guess_key_ids:
+    #             if (M[M[:, 0] == rpwid] == g).any:
+    #                 killed.append(rpw)
+    #     except KeyError:
+    #         continue
+    # fuzzlambda_q = sum([chlpwm.pw2freq(w) for w in killed])/chlpwm.totalf()
+    # for rpw in union_ball:
+    #     a = set(get_topk_typos(rpw, NH_SIZE+1)) & guess_set
+    #     if a:
+    #         print rpw, chlpwm.pw2freq(rpw)
 
-    lambda_corr_q = sum(
-        chpwm.pw2freq(rpw)
+    fuzzlambda_q = sum(
+        chlpwm.pw2freq(rpw)
         for rpw in union_ball
         if len(set(get_topk_typos(rpw, NH_SIZE)) & guess_set)>0
-    )/chpwm.totalf()
-    print("lambda-Topk Corr:", lambda_corr_q),
+    )/float(chlpwm.totalf())
+    # print("fuzzlambda_q:", fuzzlambda_q),
+
+    # lambda_topk_q = sum(
+    #     chlpwm.pw2freq(rpw)
+    #     for rpw in union_ball
+    #     if len(set(get_typodist_nh(rpw, NH_SIZE)) & guess_set)>0
+    # )/chlpwm.totalf()
+    print("fuzzlambda_q: ", fuzzlambda_q)
+    print("Secloss:", fuzzlambda_q - lambda_q)
+
+def compute_secloss_with_varying_q(guess_file, attpwf, chlpwf, q=100):
+    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)
+    attpwm = Passwords(attpwf, max_pass_len=25, min_pass_len=5)
 
-    lambda_topk_q = sum(
-        chpwm.pw2freq(rpw)
-        for rpw in union_ball
-        if len(set(get_typodist_nh(rpw, NH_SIZE)) & guess_set)>0
-    )/chpwm.totalf()
-    print("lambda-typodist: ", lambda_topk_q)
-    print("Secloss:", lambda_topk_q - lambda_q)
-
-def compute_secloss_with_varying_q(guess_file, pwfname):
-    chpwm = Passwords(pwfname)
     guesses = [w for w, _ in json.load(open(guess_file))]
     guess_set = dict((g, i) for i, g in enumerate(guesses))
-    
+
     q = len(guesses)
     union_ball = list(set([
         rpw
         for w in guesses
         for rpw in KB.word_to_typos(str(w))
-        if chpwm.pw2id(rpw)>=0
+        if chlpwm.pw2id(rpw)>=0
     ]))
-    freqs = np.array([chpwm.pw2freq(w) for w in union_ball])
+
+    freqs = np.array([chlpwm.pw2freq(w) for w in union_ball])
     M = np.full((len(union_ball), NH_SIZE+1), -1, dtype=np.int32)
     for i, rpw in enumerate(union_ball):
-        for j, tpw in enumerate(get_typodist_nh(rpw, NH_SIZE)):
+        for j, tpw in enumerate(get_topk_typos(rpw, NH_SIZE)):
             M[i, j] = guess_set.get(tpw, -1)
     print("Useful typos:", (M>0).sum())
     tq = 1
@@ -74,19 +107,19 @@ def compute_secloss_with_varying_q(guess_file, pwfname):
             last_suc = 0
         for g in guesses[tq:tq*10]:
             t = guess_set[g]
-            last_suc += freqs[(M==t).sum(axis=1)>0].sum()
-            freqs[(M==t).sum(axis=0)>0] = 0
-        lambda_topk_q.append((tq*10, last_suc/chpwm.totalf()))
-        print(tq, lambda_topk_q[-1])
+            last_suc += freqs[(M==t).sum(axis=1)>0].sum()/float(chlpwm.totalf())
+            freqs[(M==t).sum(axis=1)>0] = 0
+        lambda_topk_q.append((tq*10, last_suc))
+        print(lambda_topk_q[-1])
         tq *= 10
 
     with open('guess_file.csv', 'wb') as f:
         csvf = csv.writer(f)
-        csvf.writerow('q,lambda_q,lambda_typodist_q'.split())
+        csvf.writerow('q,lambda_q,secloss'.split())
         for tq, succ in lambda_topk_q:
-            lambda_q = chpwm.sumvalues(tq)/chpwm.totalf()
-            csvf.writerow([tq, lambda_q, succ])
-    
+            lambda_q = chlpwm.sumvalues(tq)/float(chlpwm.totalf())
+            csvf.writerow([tq, lambda_q, succ-lambda_q])
+
 if __name__ == "__main__":
     compute_secloss(sys.argv[1], sys.argv[2], sys.argv[3])
-
+    compute_secloss_with_varying_q(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/security/guess_file.csv b/security/guess_file.csv
@@ -1,6 +1,6 @@
-"q,lambda_q,lambda_typodist_q"
-0,1.0,0
-10,0.020536894151018417,3.0385196823181632e-06
-100,0.045536913333592173,2.4553703728431483e-07
-1000,0.11299376903966357,7.5360517224896903e-15
-10000,0.22297136225891534,2.31297388744976e-22
+"q,lambda_q,secloss"
+10,0.027934273219508185,-0.0083704942036872468
+100,0.056984351325850266,-0.02707686525383585
+1000,0.12889699750607819,-0.068083673621198479
+10000,0.27439403964435188,-0.155049898011522
+100000,0.66960821545605098,-0.43270522000936773
diff --git a/security/results.txt b/security/results.txt
@@ -1,5 +1,5 @@
 (10000.0, 10, -3, 5)
-gi=2386, aASSWORD -> 1 ([u'password']), 
+gi=2386, aASSWORD -> 1 ([u'password']),
 (TOPKTypo-10--3-on): 1> aASSWORD                      : 1.825e-01 (killed=1/1)
 (TOPKTypo-10--3-on): 2> lLOVEYOU                      : 1.533e-01 (killed=1/2)
 (TOPKTypo-10--3-on): 3> rRINCESS                      : 1.022e-01 (killed=1/3)
@@ -988,7 +988,7 @@ RPW freq is zero! rpw=jESUS1, f=0, guess=eesus1
 (TOPKTypo-10--3-on): 982> eeyore                        : 3.192e-04 (killed=2/1298)
 (TOPKTypo-10--3-on): 983> aariel                        : 3.192e-04 (killed=2/1300)
 (TOPKTypo-10--3-on): 984> eeresa                      (10000.0, 0, 0, 5)
-gi=52654, 1123456 -> 15 ([u'123456', u'0123456', u'A123456', u'J123456', u'M123456', u'S123456', u'K123456', u'B123456', u'D123456', u'C123456', u'T123456', u'R123456', u'L123456', u'E123456', u'a123456']), 
+gi=52654, 1123456 -> 15 ([u'123456', u'0123456', u'A123456', u'J123456', u'M123456', u'S123456', u'K123456', u'B123456', u'D123456', u'C123456', u'T123456', u'R123456', u'L123456', u'E123456', u'a123456']),
 (TOPKTypo-0-0-on): 1> 1123456                       : 9.064e-01 (killed=15/15)
 (TOPKTypo-0-0-on): 2> 112345                        : 2.485e-01 (killed=10/25)
 (TOPKTypo-0-0-on): 3> 1123456789                    : 2.462e-01 (killed=3/28)
@@ -2246,7 +2246,7 @@ Pwid changed for u'CONTRASE\xd1A' -> 'CONTRASEA'
 (TOPKTypo-10--3-off) Processed: 97000
 (TOPKTypo-10--3-off) Processed: 98000
 (TOPKTypo-10--3-off) Processed: 99000
-gi=276748, 1123456 -> 38 ([u'123456', u'0123456', u'A123456', u'J123456', u'M123456', u'S123456', u'K123456', u'B123456', u'D123456', u'C123456', u'T123456', u'R123456', u'L123456', u'E123456', u'a123456', u'N123456', u'P123456', u'H123456', u'Q123456', u'G123456', u'1123456', u'1123456', u'V123456', u'F123456', u'Z123456', u'j123456', u'W123456', u'I123456', u'Y123456', u'O123456', u'k123456', u'm123456', u's123456', u'c123456', u'X123456', u'd123456', u'`123456', u'l123456']), 
+gi=276748, 1123456 -> 38 ([u'123456', u'0123456', u'A123456', u'J123456', u'M123456', u'S123456', u'K123456', u'B123456', u'D123456', u'C123456', u'T123456', u'R123456', u'L123456', u'E123456', u'a123456', u'N123456', u'P123456', u'H123456', u'Q123456', u'G123456', u'1123456', u'1123456', u'V123456', u'F123456', u'Z123456', u'j123456', u'W123456', u'I123456', u'Y123456', u'O123456', u'k123456', u'm123456', u's123456', u'c123456', u'X123456', u'd123456', u'`123456', u'l123456']),
 (TOPKTypo-10--3-off): 1> 1123456                       : 9.154e-01 (killed=38/1)
 (TOPKTypo-10--3-off): 2> 223456                        : 8.937e-01 (killed=5/2)
 (TOPKTypo-10--3-off): 3> 1234561                       : 8.927e-01 (killed=2/3)
@@ -8393,7 +8393,7 @@ Pwid changed for u'CONTRASE\xd1A' -> 'CONTRASEA'
 (TOPKTypo-0-0-off) Processed: 97000
 (TOPKTypo-0-0-off) Processed: 98000
 (TOPKTypo-0-0-off) Processed: 99000
-gi=276748, 1123456 -> 38 ([u'123456', u'0123456', u'A123456', u'J123456', u'M123456', u'S123456', u'K123456', u'B123456', u'D123456', u'C123456', u'T123456', u'R123456', u'L123456', u'E123456', u'a123456', u'N123456', u'P123456', u'H123456', u'Q123456', u'G123456', u'1123456', u'1123456', u'V123456', u'F123456', u'Z123456', u'j123456', u'W123456', u'I123456', u'Y123456', u'O123456', u'k123456', u'm123456', u's123456', u'c123456', u'X123456', u'd123456', u'`123456', u'l123456']), 
+gi=276748, 1123456 -> 38 ([u'123456', u'0123456', u'A123456', u'J123456', u'M123456', u'S123456', u'K123456', u'B123456', u'D123456', u'C123456', u'T123456', u'R123456', u'L123456', u'E123456', u'a123456', u'N123456', u'P123456', u'H123456', u'Q123456', u'G123456', u'1123456', u'1123456', u'V123456', u'F123456', u'Z123456', u'j123456', u'W123456', u'I123456', u'Y123456', u'O123456', u'k123456', u'm123456', u's123456', u'c123456', u'X123456', u'd123456', u'`123456', u'l123456']),
 (TOPKTypo-0-0-off): 1> 1123456                       : 9.154e-01 (killed=38/1)
 (TOPKTypo-0-0-off): 2> 223456                        : 8.937e-01 (killed=5/2)
 (TOPKTypo-0-0-off): 3> 1234561                       : 8.927e-01 (killed=2/3)
@@ -22387,3 +22387,5 @@ RPW freq is zero! rpw=JJAMES, f=0, guess=JJAMES
 (TOPKTypo-0-0-off): Total fuzzy success: 115.734034382
 (TOPKTypo-0-0-off): Total normal success: 22.2971362259
 ('Saving the guesses:', 'guesses/rockyou-withcount_guesses_10000_typodist_5_TOPKTypo-0-0-off.json')
+Done creating all the parts
+rockyou-withcount__0_2000000_typo.trie and rockyou-withcount__0_2000000_typo.trie exits. So returning