Skip to content
This repository has been archived by the owner on Dec 13, 2024. It is now read-only.

Commit

Permalink
Add weighting of terms for cosine similarity
Browse files Browse the repository at this point in the history
For the cosine similarity comparison between user query and
CPE entries, an exponential decay function is now used for
assigning decreasing weights to terms according to their position
in a CPE entry.

As a result, terms that appear later in a CPE entry contribute
less to the comparison. This solves a general problem in which the
wrong software CPEs were returned when, for example, only a single
word matched between the user query and the CPE entry.
  • Loading branch information
ra1nb0rn committed Dec 12, 2023
1 parent fbf5aa3 commit 9c6bd48
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 23 deletions.
31 changes: 28 additions & 3 deletions cpe_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
DEPRECATED_CPES_FILE = os.path.join(SCRIPT_DIR, "deprecated-cpes.json")
DB_URI, DB_CONN_MEM = 'file:cpedb?mode=memory&cache=shared', None
TEXT_TO_VECTOR_RE = re.compile(r"[\w+\.]+")
CPE_TERM_WEIGHT_EXP_FACTOR = -0.08
GET_ALL_CPES_RE = re.compile(r'(.*);.*;.*')
VERSION_MATCH_ZE_RE = re.compile(r'\b([\d]+\.?){1,4}\b')
VERSION_MATCH_CPE_CREATION_RE = re.compile(r'\b((\d+[\.\-]?){1,4}([a-z\d]{0,3})?)[^\w]*$')
Expand Down Expand Up @@ -116,16 +117,40 @@ def perform_calculations(cpes, requestno):

cpe_info = []
for cpe in cpes:
# prepare CPE and its name for computation of cosine similarity values
cpe_mod = cpe.split(';')[0].replace("_", ":").replace("*", "").replace("\\", "")
cpe_name = cpe.split(';')[1].lower()
cpe_name_elems = [word for word in cpe_name.split()]
cpe_elems = [cpe_part for cpe_part in cpe_mod[10:].split(':') if cpe_part != ""]
words = TEXT_TO_VECTOR_RE.findall(" ".join(cpe_elems + cpe_name_elems))
cpe_tf = Counter(words)
cpe_name_elems = [word for word in cpe_name.split()]

# compute term weights with exponential decay according to word position
words_cpe = TEXT_TO_VECTOR_RE.findall(' '.join(cpe_elems))
words_cpe_name = TEXT_TO_VECTOR_RE.findall(' '.join(cpe_name_elems))
word_weights_cpe = {}
for i, word in enumerate(words_cpe):
if word not in word_weights_cpe: # always use greatest weight
word_weights_cpe[word] = math.exp(CPE_TERM_WEIGHT_EXP_FACTOR * i)

word_weights_cpe_name = {}
for i, word in enumerate(words_cpe_name):
if word not in word_weights_cpe_name: # always use greatest weight
word_weights_cpe_name[word] = math.exp(CPE_TERM_WEIGHT_EXP_FACTOR * i)

# compute CPE entry's cosine vector for similarity comparison
cpe_tf = Counter(words_cpe + words_cpe_name)
for term, tf in cpe_tf.items():
cpe_tf[term] = tf / len(cpe_tf)
if term in word_weights_cpe and term in word_weights_cpe_name:
# average both obtained weights from CPE itself and its name
cpe_tf[term] *= 0.5 * word_weights_cpe[term] + 0.5 * word_weights_cpe_name[term]
elif term in word_weights_cpe:
cpe_tf[term] *= word_weights_cpe[term]
elif term in word_weights_cpe_name:
cpe_tf[term] *= word_weights_cpe_name[term]

cpe_abs = math.sqrt(sum([cnt**2 for cnt in cpe_tf.values()]))
cpe_info.append((cpe.split(';')[0].lower(), cpe_tf, cpe_abs))

return cpe_info


Expand Down
40 changes: 20 additions & 20 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,91 +9,91 @@ def test_search_wp_572(self):
self.maxDiff = None
query = 'WordPress 5.7.2'
test_best_match_cpe = 'cpe:2.3:a:wordpress:wordpress:5.7.2:*:*:*:*:*:*:*'
test_best_match_score = '0.98058067569092'
test_best_match_score = 0.9686485306860276
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_apache_2425(self):
self.maxDiff = None
query = 'Apache 2.4.25'
test_best_match_cpe = 'cpe:2.3:a:apache:http_server:2.4.25:*:*:*:*:*:*:*'
test_best_match_score = '0.7372097807744855'
test_best_match_score = 0.7427124236405216
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_proftpd_133c(self):
self.maxDiff = None
query = 'Proftpd 1.3.3c'
test_best_match_cpe = 'cpe:2.3:a:proftpd:proftpd:1.3.3:c:*:*:*:*:*:*'
test_best_match_score = '0.8333333333333334'
test_best_match_score = 0.829017833421458
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_thingsboard_341(self):
self.maxDiff = None
query = 'Thingsboard 3.4.1'
test_best_match_cpe = 'cpe:2.3:a:thingsboard:thingsboard:3.4.1:*:*:*:*:*:*:*'
test_best_match_score = '0.98058067569092'
test_best_match_score = 0.9686485306860276
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_redis_323(self):
self.maxDiff = None
query = 'Redis 3.2.3'
test_best_match_cpe = 'cpe:2.3:a:redis:redis:3.2.3:*:*:*:*:*:*:*'
test_best_match_score = '0.98058067569092'
test_best_match_score = 0.9686485306860276
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_piwik_045(self):
self.maxDiff = None
query = 'Piwik 0.4.5'
test_best_match_cpe = 'cpe:2.3:a:piwik:piwik:0.4.5:*:*:*:*:*:*:*'
test_best_match_score = '0.98058067569092'
test_best_match_score = 0.9686485306860276
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_vmware_spring_framework_5326(self):
self.maxDiff = None
query = 'VMWare Spring Framework 5.3.26'
test_best_match_cpe = 'cpe:2.3:a:vmware:spring_framework:5.3.26:*:*:*:*:*:*:*'
test_best_match_score = '1.0'
test_best_match_score = 0.996033093730958
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_zulip_48(self):
self.maxDiff = None
query = 'Zulip 4.8'
test_best_match_cpe = 'cpe:2.3:a:zulip:zulip:4.8:*:*:*:*:*:*:*'
test_best_match_score = '0.98058067569092'
test_best_match_score = 0.9686485306860276
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_electron_1317(self):
self.maxDiff = None
query = 'Electron 13.1.7'
test_best_match_cpe = 'cpe:2.3:a:electronjs:electron:13.1.7:*:*:*:*:*:*:*'
test_best_match_score = '0.816496580927726'
test_best_match_score = 0.7817733882696567
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

def test_search_blackice_agent_for_server_30(self):
self.maxDiff = None
query = 'BlackIce Agent for Server 3.0'
test_best_match_cpe = 'cpe:2.3:a:iss:blackice_agent_for_server:3.0:*:*:*:*:*:*:*'
test_best_match_score = '0.9128709291752767'
test_best_match_score = 0.8665018147937851
result = search_cpes(queries=[query])
self.assertEqual(result[query][0][0], test_best_match_cpe)
self.assertEqual(str(result[query][0][1]), test_best_match_score)
self.assertAlmostEqual(result[query][0][1], test_best_match_score)

if __name__ == '__main__':
unittest.main()

0 comments on commit 9c6bd48

Please sign in to comment.