From b4c345e6a5fa929ba20eac19183b9c777055f52d Mon Sep 17 00:00:00 2001 From: Gaspard Coulet Date: Wed, 28 Apr 2021 23:12:36 +0200 Subject: Initial commit --- TrainingSets/spam.txt | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 TrainingSets/spam.txt (limited to 'TrainingSets/spam.txt') diff --git a/TrainingSets/spam.txt b/TrainingSets/spam.txt new file mode 100644 index 0000000..0ff26ac --- /dev/null +++ b/TrainingSets/spam.txt @@ -0,0 +1,90 @@ +| SPAM E-MAIL DATABASE ATTRIBUTES (in .names format) +| +| 48 continuous real [0,100] attributes of type word_freq_WORD +| = percentage of words in the e-mail that match WORD, +| i.e. 100 * (number of times the WORD appears in the e-mail) / +| total number of words in e-mail. A "word" in this case is any +| string of alphanumeric characters bounded by non-alphanumeric +| characters or end-of-string. +| +| 6 continuous real [0,100] attributes of type char_freq_CHAR +| = percentage of characters in the e-mail that match CHAR, +| i.e. 100 * (number of CHAR occurences) / total characters in e-mail +| +| 1 continuous real [1,...] attribute of type capital_run_length_average +| = average length of uninterrupted sequences of capital letters +| +| 1 continuous integer [1,...] attribute of type capital_run_length_longest +| = length of longest uninterrupted sequence of capital letters +| +| 1 continuous integer [1,...] attribute of type capital_run_length_total +| = sum of length of uninterrupted sequences of capital letters +| = total number of capital letters in the e-mail +| +| 1 nominal {0,1} class attribute of type spam +| = denotes whether the e-mail was considered spam (1) or not (0), +| i.e. unsolicited commercial e-mail. +| +| For more information, see file 'spambase.DOCUMENTATION' at the +| UCI Machine Learning Repository: http://www.ics.uci.edu/~mlearn/MLRepository.html + + +1, 0. | spam, non-spam classes + +word_freq_make: continuous. +word_freq_address: continuous. +word_freq_all: continuous. +word_freq_3d: continuous. +word_freq_our: continuous. +word_freq_over: continuous. +word_freq_remove: continuous. +word_freq_internet: continuous. +word_freq_order: continuous. +word_freq_mail: continuous. +word_freq_receive: continuous. +word_freq_will: continuous. +word_freq_people: continuous. +word_freq_report: continuous. +word_freq_addresses: continuous. +word_freq_free: continuous. +word_freq_business: continuous. +word_freq_email: continuous. +word_freq_you: continuous. +word_freq_credit: continuous. +word_freq_your: continuous. +word_freq_font: continuous. +word_freq_000: continuous. +word_freq_money: continuous. +word_freq_hp: continuous. +word_freq_hpl: continuous. +word_freq_george: continuous. +word_freq_650: continuous. +word_freq_lab: continuous. +word_freq_labs: continuous. +word_freq_telnet: continuous. +word_freq_857: continuous. +word_freq_data: continuous. +word_freq_415: continuous. +word_freq_85: continuous. +word_freq_technology: continuous. +word_freq_1999: continuous. +word_freq_parts: continuous. +word_freq_pm: continuous. +word_freq_direct: continuous. +word_freq_cs: continuous. +word_freq_meeting: continuous. +word_freq_original: continuous. +word_freq_project: continuous. +word_freq_re: continuous. +word_freq_edu: continuous. +word_freq_table: continuous. +word_freq_conference: continuous. +char_freq_;: continuous. +char_freq_(: continuous. +char_freq_[: continuous. +char_freq_!: continuous. +char_freq_$: continuous. +char_freq_#: continuous. +capital_run_length_average: continuous. +capital_run_length_longest: continuous. +capital_run_length_total: continuous. -- cgit v1.2.3