Skip to content

Commit

Permalink
Merge pull request #19 from angelosalatino/v3.3
Browse files Browse the repository at this point in the history
V3.3
  • Loading branch information
angelosalatino authored Dec 23, 2024
2 parents 4e056e7 + 6aafd93 commit 86a18c8
Show file tree
Hide file tree
Showing 8 changed files with 372 additions and 17 deletions.
4 changes: 2 additions & 2 deletions CSO-Classifier.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
"metadata": {},
"outputs": [],
"source": [
"cc = CSOClassifier(explanation=True, get_weights=True)\n",
"cc = CSOClassifier(explanation=True, get_weights=True, filter_by=[\"computer security\"])\n",
"\n",
"result = cc.run(paper)"
]
Expand Down Expand Up @@ -120,4 +120,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
2 changes: 1 addition & 1 deletion CSO-Classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
# In[Run Classifier]


cc = CSOClassifier(explanation=True, get_weights=True)
cc = CSOClassifier(explanation=True, get_weights=True, filter_by=["computer security"])

result = cc.run(paper)

Expand Down
149 changes: 147 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,15 @@ Read more: [https://skm.kmi.open.ac.uk/cso-classifier/](https://skm.kmi.open.ac.
- [Sample Input (SP)](#sample-input-sp)
- [Run (SP)](#run-sp)
- [Sample Output (SP)](#sample-output-sp)
- [Run on Single Paper with filter\_by](#run-on-single-paper-with-filter_by)
- [Sample Output when using the filter\_by parameter](#sample-output-when-using-the-filter_by-parameter)
- [Classifying in batch mode (BM)](#classifying-in-batch-mode-bm)
- [Sample Input (BM)](#sample-input-bm)
- [Run (BM)](#run-bm)
- [Sample Output (BM)](#sample-output-bm)
- [Parameters](#parameters)
- [Releases](#releases)
- [v3.3](#v33)
- [v3.2](#v32)
- [v3.1](#v31)
- [v3.0](#v30)
Expand Down Expand Up @@ -345,6 +348,140 @@ Below you can find an example. The keys syntactic and semantic respectively cont
}
```

#### Run on Single Paper with filter_by

In this example, we will run the CSO Classifier by filtering topics in *computer security* (look at how the ```filter_by``` parameter is set).

```python
from cso_classifier import CSOClassifier
cc = CSOClassifier(modules = "both", enhancement = "first", explanation = True, filter_by=["computer security"])
result = cc.run(paper)
print(result)
```

#### Sample Output when using the filter_by parameter

The JSON below is the produced output, and as you can see there are 4 additional keys (*filtered_XXXX*) at the bottom containing only a subset of topics within the field of **computer security**.

```json
{
"syntactic": [
"real-world networks",
"anonymization",
"network topology",
"data privacy",
"social networks",
"privacy",
"twitter",
"graph theory",
"online social networks",
"anonymity",
"data mining",
"micro-blog",
"sensitive informations"
],
"semantic": [
"anonymization",
"network topology",
"topology",
"data privacy",
"social networks",
"privacy",
"twitter",
"graph theory",
"online social networks",
"anonymity",
"data mining",
"micro-blog"
],
"union": [
"real-world networks",
"anonymization",
"network topology",
"topology",
"data privacy",
"social networks",
"privacy",
"twitter",
"graph theory",
"online social networks",
"anonymity",
"data mining",
"micro-blog",
"sensitive informations"
],
"enhanced": [
"complex networks",
"privacy preserving",
"computer networks",
"world wide web",
"computer security",
"social media",
"theoretical computer science",
"online systems",
"authentication",
"network security",
"computer science",
"access control"
],
"explanation": {
"social networks": ["real-world networks", "social networks", "twitter", "social-network", "online social networks", "social network", "microblogging", "social networking"],
"online social networks": ["online social networks", "social networks", "social network"],
"sensitive informations": ["sensitive information"],
"data mining": ["data mining", "mining", "data-mining"],
"privacy": ["anonymous", "anonymity", "sensitive information", "data privacy", "privacy"],
"anonymization": ["anonymization"],
"anonymity": ["anonymous", "anonymity"],
"real-world networks": ["real-world networks"],
"twitter": ["twitter graph", "anonymous twitter", "microblogging", "microblogging service", "twitter"],
"micro-blog": ["twitter graph", "anonymous twitter", "microblogging", "microblogging service", "twitter"],
"network topology": ["network topology", "topology"],
"data privacy": ["data privacy", "privacy"],
"graph theory": ["graph theory"],
"topology": ["network topology", "topology"],
"complex networks": ["real-world networks"],
"privacy preserving": ["anonymization"],
"computer networks": ["network topology", "topology"],
"world wide web": ["real-world networks", "social networks", "twitter", "social-network", "online social networks", "social network", "microblogging", "social networking"],
"computer security": ["anonymous", "anonymity", "sensitive information", "data privacy", "privacy"],
"social media": ["microblogging", "twitter"],
"theoretical computer science": ["graph theory"],
"online systems": ["online social networks", "social networks", "social network"],
"authentication": ["anonymous", "anonymity"],
"network security": ["anonymous", "anonymity", "sensitive information"],
"computer science": ["data mining", "mining", "data-mining"],
"access control": ["sensitive information"]
},
"filtered_syntactic": [
"anonymization",
"data privacy",
"privacy",
"anonymity",
"sensitive informations"
],
"filtered_semantic": [
"anonymization",
"data privacy",
"privacy",
"anonymity"
],
"filtered_union": [
"anonymization",
"data privacy",
"privacy",
"anonymity",
"sensitive informations"
],
"filtered_enhanced": [
"privacy preserving",
"computer security",
"authentication",
"network security",
"access control"
]
}
```

### Classifying in batch mode (BM)

#### Sample Input (BM)
Expand Down Expand Up @@ -432,7 +569,7 @@ Below you can find an example. The keys syntactic and semantic respectively cont
```

### Parameters
Beside the paper(s), the function running the CSO Classifier accepts seven additional parameters: (i) **workers**, (ii) **modules**, (iii) **enhancement**, (iv) **explanation**, (v) **delete_outliers**, (vi) **fast_classification**, and (vii) **silent**. There is no particular order on how to specify these paramaters. Here we explain their usage. The workers parameters is an integer (equal or greater than 1), modules and enhancement are strings that define a particular behaviour for the classifier. The explanation, delete_outliers, fast_classification, and silent parameters are booleans.
Besides the paper(s), the function running the CSO Classifier accepts nine additional parameters: (i) **workers**, (ii) **modules**, (iii) **enhancement**, (iv) **explanation**, (v) **delete_outliers**, (vi) **fast_classification**, (vii) **get_weights**, (viii) **silent**, and (ix) **filter_by**. There is no particular order on how to specify these parameters. Here we explain their usage. The workers parameter is an integer (equal or greater than 1), modules and enhancement are strings that define a particular behaviour for the classifier. The explanation, delete_outliers, fast_classification, get_weights, and silent parameters are booleans. Finally, filter_by is a list of CSO topics.

(i) The parameter *workers* defines the number of threads to run for classifying the input corpus. For instance, if ```workers = 4```, there will be 4 instances of the CSO Classifier, each one receiving a chunk (equally split) of the corpus to process. Once all processes are completed, the results will be aggregated and returned. The default value for *workers* is *1*. This parameter is available only when running the classifier in *batch mode*.

Expand All @@ -450,6 +587,8 @@ Beside the paper(s), the function running the CSO Classifier accepts seven addit

(viii) The parameter *silent* can be either *True* or *False*. This determines whether the classifier prints its progress in the console. If set to True, the classifier will be silent and will not print any output while classifying. The default value for *silent* is *False*.

(ix) The parameter *filter_by* is a list of CSO topics, and lets you focus the classification on specific sub-branches of CSO. For instance, to narrow down the results to subtopics within **artificial intelligence** and **semantic web** you can set ```filter_by = ["artificial intelligence", "semantic web"]```. This will produce four extra outputs (*filtered_syntactic*, *filtered_semantic*, *filtered_union*, *filtered_enhanced*) containing only the CSO topics that fall under the hierarchical structure of the specified areas. By default this parameter is an empty list, and therefore the classifier will consider all CSO topics as usual. You can check [Run on Single Paper with filter\_by](#run-on-single-paper-with-filter_by) to see how it works.



|# | Parameter | Single Paper | Batch Mode |
Expand All @@ -462,14 +601,20 @@ Beside the paper(s), the function running the CSO Classifier accepts seven addit
|vi | fast_classification| :white_check_mark: | :white_check_mark: |
|vii| get_weights | :white_check_mark: | :white_check_mark: |
|viii| silent | :white_check_mark: | :white_check_mark: |
|ix| filter_by | :white_check_mark: | :white_check_mark: |


**Table 1**: Parameters availability when using CSO Classifier


## Releases

Here we list the available releases for the CSO Classifier. These releases are available for download both from [Github](https://github.com/angelosalatino/cso-classifier/releases) and [Zenodo](10.5281/zenodo.2660819).
Here we list the available releases for the CSO Classifier. These releases are available for download both from [Github](https://github.com/angelosalatino/cso-classifier/releases) and [Zenodo](http://doi.org/10.5281/zenodo.2660819).

### v3.3

This release extends version 3.2 with a new feature that lets you refine the classification process by focusing on specific areas within the Computer Science Ontology. Specifically, providing one or more topics within the parameter *filter_by* (type list), the classifier will extract the sub-branches of such CSO topics, and when classifying will narrow down the output to only the sub-topics available in those areas. This is especially helpful when you are interested in exploring specific branches of the CSO, such as identifying only the concepts related to **artificial intelligence** and **semantic web** within a given paper, and can be achieved by setting ```filter_by = ["artificial intelligence", "semantic web"]``` (see [Parameters](#parameters)). If this parameter is set, the classifier will return the standard classification results, with four extra sets of results (*filtered_syntactic*, *filtered_semantic*, *filtered_union*, *filtered_enhanced*) containing only the filtered topics. This gives users the full picture and a focused view within the chosen areas.


### v3.2

Expand Down
44 changes: 36 additions & 8 deletions cso_classifier/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,21 @@ def __init__(self, **parameters):
True to return weights. Default value is False
- silent (boolean): determines whether to print the progress. If true goes in silent mode.
Instead, if false, it prints the progress to standard output.
- filter_by (list): determines whether the output should be filtered according to certain branches of CSO. Please note,
this will not filter the regular result set, but rather return additional keys with filtered topics
"""
self.modules = parameters["modules"] if "modules" in parameters else "both"
self.enhancement = parameters["enhancement"] if "enhancement" in parameters else "first"
self.explanation = parameters["explanation"] if "explanation" in parameters else False
self.delete_outliers = parameters["delete_outliers"] if "delete_outliers" in parameters else True
self.fast_classification = parameters["fast_classification"] if "fast_classification" in parameters else True
self.silent = parameters["silent"] if "silent" in parameters else False
self.get_weights = parameters["get_weights"] if "get_weights" in parameters else False
self.silent = parameters["silent"] if "silent" in parameters else False

self.filter_output = True if "filter_by" in parameters else False
self.filter_by = parameters["filter_by"] if "filter_by" in parameters else []


self.__check_parameters(parameters)

Expand Down Expand Up @@ -84,7 +90,10 @@ def run(self, paper):
self.models_loaded = True

t_paper = Paper(paper, self.modules)
result = Result(self.explanation, self.get_weights)
result = Result(self.explanation, self.get_weights, self.filter_output)





# Passing parameters to the two classes (synt and sema) and actioning classifiers
Expand All @@ -106,8 +115,14 @@ def run(self, paper):
result.dump_temporary_explanation(sema_module.get_explanation())


postprocess = post(self.model, self.cso, enhancement=self.enhancement, result=result, delete_outliers=self.delete_outliers, get_weights=self.get_weights)
result = postprocess.filtering_outliers()
postprocess = post(self.model,
self.cso,
enhancement=self.enhancement,
result=result,
delete_outliers=self.delete_outliers,
get_weights=self.get_weights,
filter_by=self.filter_by)
result = postprocess.process()

return result.get_dict()

Expand Down Expand Up @@ -175,7 +190,12 @@ def _batch_run_single_worker(self, papers):
# Passing parameters to the two classes (synt and sema)
synt_module = synt(cso)
sema_module = sema(model, cso, self.fast_classification)
postprocess = post(model, cso, enhancement=self.enhancement, delete_outliers=self.delete_outliers, get_weights=self.get_weights)
postprocess = post(model,
cso,
enhancement=self.enhancement,
delete_outliers=self.delete_outliers,
get_weights=self.get_weights,
filter_by=self.filter_by)


# initializing variable that will contain output
Expand All @@ -186,7 +206,7 @@ def _batch_run_single_worker(self, papers):
print("Processing:", paper_id)

paper.set_paper(paper_value)
result = Result(self.explanation, self.get_weights)
result = Result(self.explanation, self.get_weights, self.filter_output)

# Passing paper and actioning the classifier
if self.modules in ('syntactic','both'):
Expand All @@ -205,7 +225,7 @@ def _batch_run_single_worker(self, papers):
result.dump_temporary_explanation(sema_module.get_explanation())

postprocess.set_result(result)
result = postprocess.filtering_outliers()
result = postprocess.process()

class_res[paper_id] = result.get_dict()
return class_res
Expand All @@ -232,10 +252,18 @@ def __check_parameters(self, parameters):
if "fast_classification" in parameters:
if not isinstance(parameters["fast_classification"], bool):
raise TypeError("Field fast_classification must be set to either True or False. Got %s instead." % type(parameters["fast_classification"]).__name__)


if "get_weights" in parameters:
if not isinstance(parameters["get_weights"], bool):
raise TypeError("Field get_weights must be set to either True or False. Got %s instead." % type(parameters["get_weights"]).__name__)

if "silent" in parameters:
if not isinstance(parameters["silent"], bool):
raise TypeError("Field silent must be set to either True or False. Got %s instead." % type(parameters["silent"]).__name__)

if "filter_by" in parameters:
if not isinstance(parameters["filter_by"], list):
raise TypeError("Field filter_by must be a list of strings. Got %s instead." % type(parameters["filter_by"]).__name__)


@staticmethod
Expand Down
Loading

0 comments on commit 86a18c8

Please sign in to comment.