Merge pull request #19 from angelosalatino/v3.3

V3.3
angelosalatino · Dec 23, 2024 · 86a18c8 · 86a18c8
2 parents 4e056e7 + 6aafd93
commit 86a18c8
Show file tree

Hide file tree

Showing 8 changed files with 372 additions and 17 deletions.
diff --git a/CSO-Classifier.ipynb b/CSO-Classifier.ipynb
@@ -70,7 +70,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "cc = CSOClassifier(explanation=True, get_weights=True)\n",
+    "cc = CSOClassifier(explanation=True, get_weights=True, filter_by=[\"computer security\"])\n",
     "\n",
     "result = cc.run(paper)"
    ]
@@ -120,4 +120,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/CSO-Classifier.py b/CSO-Classifier.py
@@ -46,7 +46,7 @@
 # In[Run Classifier]
 
 
-cc = CSOClassifier(explanation=True, get_weights=True)
+cc = CSOClassifier(explanation=True, get_weights=True, filter_by=["computer security"])
 
 result = cc.run(paper)
 

diff --git a/README.md b/README.md
@@ -33,12 +33,15 @@ Read more: [https://skm.kmi.open.ac.uk/cso-classifier/](https://skm.kmi.open.ac.
       - [Sample Input (SP)](#sample-input-sp)
       - [Run (SP)](#run-sp)
       - [Sample Output (SP)](#sample-output-sp)
+      - [Run on Single Paper with filter\_by](#run-on-single-paper-with-filter_by)
+      - [Sample Output when using the filter\_by parameter](#sample-output-when-using-the-filter_by-parameter)
     - [Classifying in batch mode (BM)](#classifying-in-batch-mode-bm)
       - [Sample Input (BM)](#sample-input-bm)
       - [Run (BM)](#run-bm)
       - [Sample Output (BM)](#sample-output-bm)
     - [Parameters](#parameters)
   - [Releases](#releases)
+    - [v3.3](#v33)
     - [v3.2](#v32)
     - [v3.1](#v31)
     - [v3.0](#v30)
@@ -345,6 +348,140 @@ Below you can find an example. The keys syntactic and semantic respectively cont
 }
 ```
 
+#### Run on Single Paper with filter_by
+
+In this example, we will run the CSO Classifier by filtering topics in *computer security* (look at how the ```filter_by``` parameter is set).
+
+```python
+from cso_classifier import CSOClassifier
+cc = CSOClassifier(modules = "both", enhancement = "first", explanation = True, filter_by=["computer security"])
+result = cc.run(paper)
+print(result)
+```
+
+#### Sample Output when using the filter_by parameter
+
+The JSON below is the produced output, and as you can see there are 4 additional keys (*filtered_XXXX*) at the bottom containing only a subset of topics within the field of **computer security**.
+
+```json
+{
+    "syntactic": [
+        "real-world networks",
+        "anonymization",
+        "network topology",
+        "data privacy",
+        "social networks",
+        "privacy",
+        "twitter",
+        "graph theory",
+        "online social networks",
+        "anonymity",
+        "data mining",
+        "micro-blog",
+        "sensitive informations"
+    ],
+    "semantic": [
+        "anonymization",
+        "network topology",
+        "topology",
+        "data privacy",
+        "social networks",
+        "privacy",
+        "twitter",
+        "graph theory",
+        "online social networks",
+        "anonymity",
+        "data mining",
+        "micro-blog"
+    ],
+    "union": [
+        "real-world networks",
+        "anonymization",
+        "network topology",
+        "topology",
+        "data privacy",
+        "social networks",
+        "privacy",
+        "twitter",
+        "graph theory",
+        "online social networks",
+        "anonymity",
+        "data mining",
+        "micro-blog",
+        "sensitive informations"
+    ],
+    "enhanced": [
+        "complex networks",
+        "privacy preserving",
+        "computer networks",
+        "world wide web",
+        "computer security",
+        "social media",
+        "theoretical computer science",
+        "online systems",
+        "authentication",
+        "network security",
+        "computer science",
+        "access control"
+    ],
+    "explanation": {
+        "social networks": ["real-world networks", "social networks", "twitter", "social-network", "online social networks", "social network", "microblogging", "social networking"],
+        "online social networks": ["online social networks", "social networks", "social network"],
+        "sensitive informations": ["sensitive information"],
+        "data mining": ["data mining", "mining", "data-mining"],
+        "privacy": ["anonymous", "anonymity", "sensitive information", "data privacy", "privacy"],
+        "anonymization": ["anonymization"],
+        "anonymity": ["anonymous", "anonymity"],
+        "real-world networks": ["real-world networks"],
+        "twitter": ["twitter graph", "anonymous twitter", "microblogging", "microblogging service", "twitter"],
+        "micro-blog": ["twitter graph", "anonymous twitter", "microblogging", "microblogging service", "twitter"],
+        "network topology": ["network topology", "topology"],
+        "data privacy": ["data privacy", "privacy"],
+        "graph theory": ["graph theory"],
+        "topology": ["network topology", "topology"],
+        "complex networks": ["real-world networks"],
+        "privacy preserving": ["anonymization"],
+        "computer networks": ["network topology", "topology"],
+        "world wide web": ["real-world networks", "social networks", "twitter", "social-network", "online social networks", "social network", "microblogging", "social networking"],
+        "computer security": ["anonymous", "anonymity", "sensitive information", "data privacy", "privacy"],
+        "social media": ["microblogging", "twitter"],
+        "theoretical computer science": ["graph theory"],
+        "online systems": ["online social networks", "social networks", "social network"],
+        "authentication": ["anonymous", "anonymity"],
+        "network security": ["anonymous", "anonymity", "sensitive information"],
+        "computer science": ["data mining", "mining", "data-mining"],
+        "access control": ["sensitive information"]
+    },
+    "filtered_syntactic": [
+        "anonymization",
+        "data privacy",
+        "privacy",
+        "anonymity",
+        "sensitive informations"
+    ],
+    "filtered_semantic": [
+        "anonymization",
+        "data privacy",
+        "privacy",
+        "anonymity"
+    ],
+    "filtered_union": [
+        "anonymization",
+        "data privacy",
+        "privacy",
+        "anonymity",
+        "sensitive informations"
+    ],
+    "filtered_enhanced": [
+        "privacy preserving",
+        "computer security",
+        "authentication",
+        "network security",
+        "access control"
+    ]
+}
+```
+
 ### Classifying in batch mode (BM)
 
 #### Sample Input (BM)
@@ -432,7 +569,7 @@ Below you can find an example. The keys syntactic and semantic respectively cont
 ```
 
 ### Parameters
-Beside the paper(s), the function running the CSO Classifier accepts seven additional parameters: (i) **workers**, (ii) **modules**, (iii) **enhancement**, (iv) **explanation**, (v) **delete_outliers**, (vi) **fast_classification**, and (vii) **silent**. There is no particular order on how to specify these paramaters. Here we explain their usage. The workers parameters is an integer (equal or greater than 1), modules and enhancement are strings that define a particular behaviour for the classifier. The explanation, delete_outliers, fast_classification, and silent parameters are booleans.
+Besides the paper(s), the function running the CSO Classifier accepts nine additional parameters: (i) **workers**, (ii) **modules**, (iii) **enhancement**, (iv) **explanation**, (v) **delete_outliers**, (vi) **fast_classification**, (vii) **get_weights**, (viii) **silent**, and (ix) **filter_by**. There is no particular order on how to specify these parameters. Here we explain their usage. The workers parameter is an integer (equal or greater than 1), modules and enhancement are strings that define a particular behaviour for the classifier. The explanation, delete_outliers, fast_classification, get_weights, and silent parameters are booleans. Finally, filter_by is a list of strings (CSO topics).
 
 (i) The parameter *workers* defines the number of threads to run for classifying the input corpus. For instance, if ```workers = 4```, there will be 4 instances of the CSO Classifier, each one receiving a chunk (equally split) of the corpus to process. Once all processes are completed, the results will be aggregated and returned. The default value for *workers* is *1*. This parameter is available only when running the classifier in *batch mode*.
 
@@ -450,6 +587,8 @@ Beside the paper(s), the function running the CSO Classifier accepts seven addit
 
 (viii) The parameter *silent* can be either *True* or *False*. This determines whether the classifier prints its progress in the console. If set to True, the classifier will be silent and will not print any output while classifying. The default value for *silent* is *False*.
 
+(ix) The parameter *filter_by* is a list of CSO topics that lets you focus the classification on specific sub-branches of CSO. For instance, to narrow down the results to subtopics within **artificial intelligence** and **semantic web** you can set ```filter_by = ["artificial intelligence", "semantic web"]```. This will produce four extra outputs (*filtered_syntactic*, *filtered_semantic*, *filtered_union*, *filtered_enhanced*) containing only the CSO topics that fall under the hierarchical structure of the specified areas. By default this parameter is an empty list, and therefore the classifier will consider all CSO topics as usual. You can check [Run on Single Paper with filter\_by](#run-on-single-paper-with-filter_by) to see how it works.
+
 
 
 |# | Parameter  |  Single Paper | Batch Mode |
@@ -462,14 +601,20 @@ Beside the paper(s), the function running the CSO Classifier accepts seven addit
 |vi | fast_classification| :white_check_mark:  | :white_check_mark: |
 |vii| get_weights       | :white_check_mark:  | :white_check_mark: |
 |viii| silent       | :white_check_mark:  | :white_check_mark: |
+|ix| filter_by       | :white_check_mark:  | :white_check_mark: |
 
 
 **Table 1**: Parameters availability when using CSO Classifier
 
 
 ## Releases
 
-Here we list the available releases for the CSO Classifier. These releases are available for download both from [Github](https://github.com/angelosalatino/cso-classifier/releases) and [Zenodo](10.5281/zenodo.2660819).
+Here we list the available releases for the CSO Classifier. These releases are available for download both from [Github](https://github.com/angelosalatino/cso-classifier/releases) and [Zenodo](http://doi.org/10.5281/zenodo.2660819).
+
+### v3.3
+
+This release extends version 3.2 with a new feature that lets you refine the classification process by focusing on specific areas within the Computer Science Ontology. Specifically, when you provide one or more topics through the parameter *filter_by* (type list), the classifier will extract the sub-branches of those CSO topics and, when classifying, will narrow down the output to only the sub-topics available in those areas. This is especially helpful when you are interested in exploring specific branches of the CSO, such as identifying only the concepts related to **artificial intelligence** and **semantic web** within a given paper, and can be achieved by setting ```filter_by = ["artificial intelligence", "semantic web"]``` (see [Parameters](#parameters)). If this parameter is set, the classifier will return the standard classification results, plus four extra sets of results (*filtered_syntactic*, *filtered_semantic*, *filtered_union*, *filtered_enhanced*) containing only the filtered topics. This gives users both the full picture and a focused view within the chosen areas.
+
 
 ### v3.2
 

diff --git a/cso_classifier/classifier.py b/cso_classifier/classifier.py
@@ -39,15 +39,21 @@ def __init__(self, **parameters):
                     True to return weights. Default value is False
             - silent (boolean): determines whether to print the progress. If true goes in silent mode
                     and does not print anything in standard output. Instead, if false prints its progress.
+            - filter_by (list): determines whether the output should be filtered according to certain branches of CSO. Please note,
+                    this will not filter the regular result set, but rather return additional keys with the filtered topics
 
         """
         self.modules             = parameters["modules"] if "modules" in parameters else "both"
         self.enhancement         = parameters["enhancement"] if "enhancement" in parameters else "first"
         self.explanation         = parameters["explanation"] if "explanation" in parameters else False
         self.delete_outliers     = parameters["delete_outliers"] if "delete_outliers" in parameters else True
         self.fast_classification = parameters["fast_classification"] if "fast_classification" in parameters else True
-        self.silent              = parameters["silent"] if "silent" in parameters else False
         self.get_weights         = parameters["get_weights"] if "get_weights" in parameters else False
+        self.silent              = parameters["silent"] if "silent" in parameters else False
+
+        self.filter_output       = True if "filter_by" in parameters else False
+        self.filter_by           = parameters["filter_by"] if "filter_by" in parameters else []
+
 
         self.__check_parameters(parameters)
 
@@ -84,7 +90,10 @@ def run(self, paper):
             self.models_loaded = True
 
         t_paper = Paper(paper, self.modules)
-        result = Result(self.explanation, self.get_weights)
+        result = Result(self.explanation, self.get_weights, self.filter_output)
+
+
+
 
 
         # Passing parameters to the two classes (synt and sema) and actioning classifiers
@@ -106,8 +115,14 @@ def run(self, paper):
                 result.dump_temporary_explanation(sema_module.get_explanation())
 
 
-        postprocess = post(self.model, self.cso, enhancement=self.enhancement, result=result, delete_outliers=self.delete_outliers, get_weights=self.get_weights)
-        result = postprocess.filtering_outliers()
+        postprocess = post(self.model, 
+                           self.cso, 
+                           enhancement=self.enhancement, 
+                           result=result, 
+                           delete_outliers=self.delete_outliers, 
+                           get_weights=self.get_weights,
+                           filter_by=self.filter_by)
+        result = postprocess.process()
 
         return result.get_dict()
 
@@ -175,7 +190,12 @@ def _batch_run_single_worker(self, papers):
         # Passing parameters to the two classes (synt and sema)
         synt_module = synt(cso)
         sema_module = sema(model, cso, self.fast_classification)
-        postprocess = post(model, cso, enhancement=self.enhancement, delete_outliers=self.delete_outliers, get_weights=self.get_weights)
+        postprocess = post(model, 
+                           cso, 
+                           enhancement=self.enhancement, 
+                           delete_outliers=self.delete_outliers, 
+                           get_weights=self.get_weights, 
+                           filter_by=self.filter_by)
 
 
         # initializing variable that will contain output
@@ -186,7 +206,7 @@ def _batch_run_single_worker(self, papers):
                 print("Processing:", paper_id)
 
             paper.set_paper(paper_value)
-            result = Result(self.explanation, self.get_weights)
+            result = Result(self.explanation, self.get_weights, self.filter_output)
 
             # Passing paper and actioning the classifier
             if self.modules in ('syntactic','both'):
@@ -205,7 +225,7 @@ def _batch_run_single_worker(self, papers):
                     result.dump_temporary_explanation(sema_module.get_explanation())
 
             postprocess.set_result(result)
-            result = postprocess.filtering_outliers()
+            result = postprocess.process()
 
             class_res[paper_id] = result.get_dict()
         return class_res
@@ -232,10 +252,18 @@ def __check_parameters(self, parameters):
         if "fast_classification" in parameters:
             if not isinstance(parameters["fast_classification"], bool):
                 raise TypeError("Field fast_classification must be set to either True or False. Got %s instead." % type(parameters["fast_classification"]).__name__)
-
+
+        if "get_weights" in parameters:
+            if not isinstance(parameters["get_weights"], bool):
+                raise TypeError("Field get_weights must be set to either True or False. Got %s instead." % type(parameters["get_weights"]).__name__)
+
         if "silent" in parameters:
             if not isinstance(parameters["silent"], bool):
                 raise TypeError("Field silent must be set to either True or False. Got %s instead." % type(parameters["silent"]).__name__)
+
+        if "filter_by" in parameters:
+            if not isinstance(parameters["filter_by"], list):
+                raise TypeError("Field filter_by must be a list of strings. Got %s instead." % type(parameters["filter_by"]).__name__)
 
 
     @staticmethod