@inproceedings{BuesgenKloeserKohletal.2022, author = {B{\"u}sgen, Andr{\´e} and Kl{\"o}ser, Lars and Kohl, Philipp and Schmidts, Oliver and Kraft, Bodo and Z{\"u}ndorf, Albert}, title = {Exploratory analysis of chat-based black market profiles with natural language processing}, series = {Proceedings of the 11th International Conference on Data Science, Technology and Applications}, booktitle = {Proceedings of the 11th International Conference on Data Science, Technology and Applications}, isbn = {978-989-758-583-8}, issn = {2184-285X}, doi = {10.5220/0011271400003269}, pages = {83 -- 94}, year = {2022}, abstract = {Messenger apps like WhatsApp or Telegram are an integral part of daily communication. Besides the various positive effects, those services extend the operating range of criminals. Open trading groups with many thousand participants emerged on Telegram. Law enforcement agencies monitor suspicious users in such chat rooms. This research shows that text analysis, based on natural language processing, facilitates this through a meaningful domain overview and detailed investigations. We crawled a corpus from such self-proclaimed black markets and annotated five attribute types products, money, payment methods, user names, and locations. Based on each message a user sends, we extract and group these attributes to build profiles. Then, we build features to cluster the profiles. Pretrained word vectors yield better unsupervised clustering results than current state-of-the-art transformer models. The result is a semantically meaningful high-level overview of the user landscape of black market chatrooms. Additionally, the extracted structured information serves as a foundation for further data exploration, for example, the most active users or preferred payment methods.}, language = {en} } @inproceedings{BuesgenKloeserKohletal.2023, author = {B{\"u}sgen, Andr{\´e} and Kl{\"o}ser, Lars and Kohl, Philipp and Schmidts, Oliver and Kraft, Bodo and Z{\"u}ndorf, Albert}, title = {From cracked accounts to fake IDs: user profiling on German telegram black market channels}, series = {Data Management Technologies and Applications}, booktitle = {Data Management Technologies and Applications}, editor = {Cuzzocrea, Alfredo and Gusikhin, Oleg and Hammoudi, Slimane and Quix, Christoph}, publisher = {Springer}, address = {Cham}, isbn = {978-3-031-37889-8 (Print)}, doi = {10.1007/978-3-031-37890-4_9}, pages = {176 -- 202}, year = {2023}, abstract = {Messenger apps like WhatsApp and Telegram are frequently used for everyday communication, but they can also be utilized as a platform for illegal activity. Telegram allows public groups with up to 200.000 participants. Criminals use these public groups for trading illegal commodities and services, which becomes a concern for law enforcement agencies, who manually monitor suspicious activity in these chat rooms. This research demonstrates how natural language processing (NLP) can assist in analyzing these chat rooms, providing an explorative overview of the domain and facilitating purposeful analyses of user behavior. We provide a publicly available corpus of annotated text messages with entities and relations from four self-proclaimed black market chat rooms. Our pipeline approach aggregates the extracted product attributes from user messages to profiles and uses these with their sold products as features for clustering. The extracted structured information is the foundation for further data exploration, such as identifying the top vendors or fine-granular price analyses. Our evaluation shows that pretrained word vectors perform better for unsupervised clustering than state-of-the-art transformer models, while the latter is still superior for sequence labeling.}, language = {en} } @inproceedings{KloeserBuesgenKohletal.2023, author = {Kl{\"o}ser, Lars and B{\"u}sgen, Andr{\´e} and Kohl, Philipp and Kraft, Bodo and Z{\"u}ndorf, Albert}, title = {Explaining relation classification models with semantic extents}, series = {DeLTA 2023: Deep Learning Theory and Applications}, booktitle = {DeLTA 2023: Deep Learning Theory and Applications}, editor = {Conte, Donatello and Fred, Ana and Gusikhin, Oleg and Sansone, Carlo}, publisher = {Springer}, address = {Cham}, isbn = {978-3-031-39058-6 (Print)}, doi = {10.1007/978-3-031-39059-3_13}, pages = {189 -- 208}, year = {2023}, abstract = {In recent years, the development of large pretrained language models, such as BERT and GPT, significantly improved information extraction systems on various tasks, including relation classification. State-of-the-art systems are highly accurate on scientific benchmarks. A lack of explainability is currently a complicating factor in many real-world applications. Comprehensible systems are necessary to prevent biased, counterintuitive, or harmful decisions. We introduce semantic extents, a concept to analyze decision patterns for the relation classification task. Semantic extents are the most influential parts of texts concerning classification decisions. Our definition allows similar procedures to determine semantic extents for humans and models. We provide an annotation tool and a software framework to determine semantic extents for humans and models conveniently and reproducibly. Comparing both reveals that models tend to learn shortcut patterns from data. These patterns are hard to detect with current interpretability methods, such as input reductions. Our approach can help detect and eliminate spurious decision patterns during model development. Semantic extents can increase the reliability and security of natural language processing systems. Semantic extents are an essential step in enabling applications in critical areas like healthcare or finance. Moreover, our work opens new research directions for developing methods to explain deep learning models.}, language = {en} } @inproceedings{KloeserKohlKraftetal.2021, author = {Kl{\"o}ser, Lars and Kohl, Philipp and Kraft, Bodo and Z{\"u}ndorf, Albert}, title = {Multi-attribute relation extraction (MARE): simplifying the application of relation extraction}, series = {Proceedings of the 2nd International Conference on Deep Learning Theory and Applications - DeLTA}, booktitle = {Proceedings of the 2nd International Conference on Deep Learning Theory and Applications - DeLTA}, isbn = {978-989-758-526-5}, doi = {10.5220/0010559201480156}, pages = {148 -- 156}, year = {2021}, abstract = {Natural language understanding's relation extraction makes innovative and encouraging novel business concepts possible and facilitates new digitilized decision-making processes. Current approaches allow the extraction of relations with a fixed number of entities as attributes. Extracting relations with an arbitrary amount of attributes requires complex systems and costly relation-trigger annotations to assist these systems. We introduce multi-attribute relation extraction (MARE) as an assumption-less problem formulation with two approaches, facilitating an explicit mapping from business use cases to the data annotations. Avoiding elaborated annotation constraints simplifies the application of relation extraction approaches. The evaluation compares our models to current state-of-the-art event extraction and binary relation extraction methods. Our approaches show improvement compared to these on the extraction of general multi-attribute relations.}, language = {en} } @inproceedings{KohlFreyerKraemeretal.2023, author = {Kohl, Philipp and Freyer, Nils and Kr{\"a}mer, Yoka and Werth, Henri and Wolf, Steffen and Kraft, Bodo and Meinecke, Matthias and Z{\"u}ndorf, Albert}, title = {ALE: a simulation-based active learning evaluation framework for the parameter-driven comparison of query strategies for NLP}, series = {Deep Learning Theory and Applications. DeLTA 2023. Communications in Computer and Information Science}, booktitle = {Deep Learning Theory and Applications. DeLTA 2023. Communications in Computer and Information Science}, editor = {Conte, Donatello and Fred, Ana and Gusikhin, Oleg and Sansone, Carlo}, publisher = {Springer}, address = {Cham}, isbn = {978-3-031-39058-6 (Print)}, doi = {978-3-031-39059-3}, pages = {235 -- 253}, year = {2023}, abstract = {Supervised machine learning and deep learning require a large amount of labeled data, which data scientists obtain in a manual, and time-consuming annotation process. To mitigate this challenge, Active Learning (AL) proposes promising data points to annotators they annotate next instead of a subsequent or random sample. This method is supposed to save annotation effort while maintaining model performance. However, practitioners face many AL strategies for different tasks and need an empirical basis to choose between them. Surveys categorize AL strategies into taxonomies without performance indications. Presentations of novel AL strategies compare the performance to a small subset of strategies. Our contribution addresses the empirical basis by introducing a reproducible active learning evaluation (ALE) framework for the comparative evaluation of AL strategies in NLP. The framework allows the implementation of AL strategies with low effort and a fair data-driven comparison through defining and tracking experiment parameters (e.g., initial dataset size, number of data points per query step, and the budget). ALE helps practitioners to make more informed decisions, and researchers can focus on developing new, effective AL strategies and deriving best practices for specific use cases. With best practices, practitioners can lower their annotation costs. We present a case study to illustrate how to use the framework.}, language = {en} } @inproceedings{KohlSchmidtsKloeseretal.2021, author = {Kohl, Philipp and Schmidts, Oliver and Kl{\"o}ser, Lars and Werth, Henri and Kraft, Bodo and Z{\"u}ndorf, Albert}, title = {STAMP 4 NLP - an agile framework for rapid quality-driven NLP applications development}, series = {Quality of Information and Communications Technology. QUATIC 2021}, booktitle = {Quality of Information and Communications Technology. QUATIC 2021}, publisher = {Springer}, address = {Cham}, isbn = {978-3-030-85346-4}, doi = {10.1007/978-3-030-85347-1_12}, pages = {156 -- 166}, year = {2021}, abstract = {The progress in natural language processing (NLP) research over the last years, offers novel business opportunities for companies, as automated user interaction or improved data analysis. Building sophisticated NLP applications requires dealing with modern machine learning (ML) technologies, which impedes enterprises from establishing successful NLP projects. Our experience in applied NLP research projects shows that the continuous integration of research prototypes in production-like environments with quality assurance builds trust in the software and shows convenience and usefulness regarding the business goal. We introduce STAMP 4 NLP as an iterative and incremental process model for developing NLP applications. With STAMP 4 NLP, we merge software engineering principles with best practices from data science. Instantiating our process model allows efficiently creating prototypes by utilizing templates, conventions, and implementations, enabling developers and data scientists to focus on the business goals. Due to our iterative-incremental approach, businesses can deploy an enhanced version of the prototype to their software environment after every iteration, maximizing potential business value and trust early and avoiding the cost of successful yet never deployed experiments.}, language = {en} } @inproceedings{SchmidtsKraftSiebigterothetal.2019, author = {Schmidts, Oliver and Kraft, Bodo and Siebigteroth, Ines and Z{\"u}ndorf, Albert}, title = {Schema Matching with Frequent Changes on Semi-Structured Input Files: A Machine Learning Approach on Biological Product Data}, series = {Proceedings of the 21st International Conference on Enterprise Information Systems - Volume 1: ICEIS}, booktitle = {Proceedings of the 21st International Conference on Enterprise Information Systems - Volume 1: ICEIS}, isbn = {978-989-758-372-8}, doi = {10.5220/0007723602080215}, pages = {208 -- 215}, year = {2019}, language = {en} } @inproceedings{SchmidtsKraftWinkensetal.2020, author = {Schmidts, Oliver and Kraft, Bodo and Winkens, Marvin and Z{\"u}ndorf, Albert}, title = {Catalog integration of low-quality product data by attribute label ranking}, series = {Proceedings of the 9th International Conference on Data Science, Technology and Applications - Volume 1: DATA}, booktitle = {Proceedings of the 9th International Conference on Data Science, Technology and Applications - Volume 1: DATA}, isbn = {978-989-758-440-4}, doi = {10.5220/0009831000900101}, pages = {90 -- 101}, year = {2020}, language = {en} } @inproceedings{SchmidtsKraftWinkensetal.2021, author = {Schmidts, Oliver and Kraft, Bodo and Winkens, Marvin and Z{\"u}ndorf, Albert}, title = {Catalog integration of heterogeneous and volatile product data}, series = {DATA 2020: Data Management Technologies and Applications}, booktitle = {DATA 2020: Data Management Technologies and Applications}, publisher = {Springer}, address = {Cham}, isbn = {978-3-030-83013-7}, doi = {10.1007/978-3-030-83014-4_7}, pages = {134 -- 153}, year = {2021}, abstract = {The integration of frequently changing, volatile product data from different manufacturers into a single catalog is a significant challenge for small and medium-sized e-commerce companies. They rely on timely integrating product data to present them aggregated in an online shop without knowing format specifications, concept understanding of manufacturers, and data quality. Furthermore, format, concepts, and data quality may change at any time. Consequently, integrating product catalogs into a single standardized catalog is often a laborious manual task. Current strategies to streamline or automate catalog integration use techniques based on machine learning, word vectorization, or semantic similarity. However, most approaches struggle with low-quality or real-world data. We propose Attribute Label Ranking (ALR) as a recommendation engine to simplify the integration process of previously unknown, proprietary tabular format into a standardized catalog for practitioners. We evaluate ALR by focusing on the impact of different neural network architectures, language features, and semantic similarity. Additionally, we consider metrics for industrial application and present the impact of ALR in production and its limitations.}, language = {en} } @inproceedings{SchreiberKraftZuendorf2016, author = {Schreiber, Marc and Kraft, Bodo and Z{\"u}ndorf, Albert}, title = {Cost-efficient quality assurance of natural language processing tools through continuous monitoring with continuous integration}, series = {3rd International Workshop on Software Engineering Research and Industrial Practice}, booktitle = {3rd International Workshop on Software Engineering Research and Industrial Practice}, doi = {10.1145/2897022.2897029}, pages = {46 -- 52}, year = {2016}, language = {en} }