{ "cells": [ { "cell_type": "markdown", "id": "3ccaa5aa", "metadata": {}, "source": [ "# NEWS SENTIMENT ANALYSIS - ETL-Pipeline using KAFKA-HADOOP-SPARK " ] }, { "cell_type": "code", "execution_count": 1, "id": "cbde3506", "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", "from pyspark.ml.classification import RandomForestClassifier\n", "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", "from pyspark.ml.feature import StopWordsRemover, Tokenizer, Word2Vec\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import lower, when\n", "import json" ] }, { "cell_type": "code", "execution_count": 2, "id": "03ae1824", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "23/12/22 05:55:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" ] }, { "data": { "text/html": [ "\n", "
\n", "

SparkSession - in-memory

\n", " \n", "
\n", "

SparkContext

\n", "\n", "

Spark UI

\n", "\n", "
\n", "
Version
\n", "
v3.0.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", "
News Sentiment analysis
\n", "
\n", "
\n", " \n", "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# spark = SparkSession.builder.master(\"spark://localhost:7077\").appName(\"News Sentiment analysis\").getOrCreate()\n", "spark = SparkSession.builder.master(\"local[*]\").appName(\"News Sentiment analysis\").getOrCreate()\n", "spark" ] }, { "cell_type": "markdown", "id": "6a76e45e", "metadata": {}, "source": [ "## Load data from HDFS" ] }, { "cell_type": "code", "execution_count": 3, "id": "7e8a1d16", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \r" ] } ], "source": [ "hdfsPath = \"hdfs://namenode:9000/user/spark/news_data_articles.txt\"\n", "df = spark.read.json(hdfsPath)" ] }, { "cell_type": "code", "execution_count": 4, "id": "0bbcb94e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+\n", "| author| content| description| published_at| source| title| url| url_to_image|\n", "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+\n", "|bbcnews|Thirty years afte...|The BBC's Fergal ...|2023-12-03 22:21:04|[bbc-news, BBC News]|South Africa: The...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|South Florida. 
A ...|The Grand Theft A...|2023-12-10 00:56:54|[bbc-news, BBC News]|Grand Theft Auto ...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|The families of t...|Police in Vermont...|2023-11-27 00:31:15|[bbc-news, BBC News]|Vermont: Three Pa...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|Homosexuality sho...|Cardinal Peter Tu...|2023-11-27 12:59:02|[bbc-news, BBC News]|Ghana Cardinal Pe...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|A key political a...|A two-year-old ta...|2023-11-23 06:00:31|[bbc-news, BBC News]|Missing Ukrainian...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|At the end of the...|Even when the war...|2023-12-09 09:48:23|[bbc-news, BBC News]|Israel-Gaza: The ...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|A manhunt is unde...|A manhunt is unde...|2023-12-11 13:10:39|[bbc-news, BBC News]|Switzerland manhu...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|It's all rather s...|Karin Kneissl dan...|2023-12-07 06:01:28|[bbc-news, BBC News]|Karin Kneissl, th...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|Russian President...|The Russian leade...|2023-12-06 12:56:45|[bbc-news, BBC News]|Putin lands in UA...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|Serbians are voti...|The ruling party ...|2023-12-17 13:17:45|[bbc-news, BBC News]|Serbians head to ...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|Police in Mexico ...|The victims were ...|2023-12-18 12:51:22|[bbc-news, BBC News]|Mexico violence: ...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|A homeless man mo...|When a drifter mo...|2023-11-26 01:11:32|[bbc-news, BBC News]|The homeless hand...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|It was, perhaps, ...|One man fined €40...|2023-11-27 17:41:30|[bbc-news, BBC News]|Slovenia Covid: T...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|Ukraine has launc...|Kyiv investigates...|2023-12-03 16:03:56|[bbc-news, BBC 
News]|Ukraine war: Russ...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|When New Zealand'...|The model for an ...|2023-12-11 00:26:33|[bbc-news, BBC News]|New Zealand smoki...|https://www.bbc.c...|https://ichef.bbc...|\n", "|Unknown|Students are bein...|Doctors have been...|2023-11-20 06:15:13|[bbc-news, BBC News]|Knife crime: 'We'...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|The murder of a y...|A vigil for healt...|2023-12-06 14:53:29|[bbc-news, BBC News]|Lurgan murder: Od...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|Super-strength st...|Synthetic opioids...|2023-12-11 00:44:28|[bbc-news, BBC News]|Street drugs stro...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|Michelle Mone say...|PPE Medpro is bei...|2023-12-10 13:15:27|[bbc-news, BBC News]|Tory peer Michell...|https://www.bbc.c...|https://ichef.bbc...|\n", "|bbcnews|All 221 Republica...|Republicans retur...|2023-12-15 18:47:13|[bbc-news, BBC News]|Biden impeachment...|https://www.bbc.c...|https://ichef.bbc...|\n", "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "df.show()" ] }, { "cell_type": "markdown", "id": "91492eec", "metadata": {}, "source": [ "# Data Preparation" ] }, { "cell_type": "code", "execution_count": 5, "id": "5404d2be", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----+-----+\n", "|label|count|\n", "+-----+-----+\n", "| 1| 65|\n", "| 0| 546|\n", "+-----+-----+\n", "\n" ] } ], "source": [ "labeled_data = df.withColumn(\"label\", when(lower(df[\"description\"]).contains(\"crime\") |\n", " lower(df[\"description\"]).contains(\"murder\") |\n", " lower(df[\"description\"]).contains(\"robbery\"), 1).otherwise(0))\n", "labeled_data.groupBy(\"label\").count().show()" ] }, { "cell_type": "markdown", "id": "f27a8072", "metadata": {}, "source": 
[ "### Define a function to convert text to lowercase and tokenize it" ] }, { "cell_type": "code", "execution_count": 6, "id": "fd574468", "metadata": {}, "outputs": [], "source": [ "def tokenize_text(df, input_col=\"description\", output_col=\"words\"):\n", "    \"\"\"Return `df` with an extra column holding the tokens of a text column.\n", "\n", "    Uses pyspark.ml.feature.Tokenizer, which lowercases the text and splits it on whitespace.\n", "\n", "    Args:\n", "        df: Spark DataFrame that contains the string column `input_col`.\n", "        input_col: name of the text column to tokenize (default \"description\").\n", "        output_col: name of the token column to add (default \"words\").\n", "\n", "    Returns:\n", "        The DataFrame produced by Tokenizer.transform, i.e. `df` plus `output_col`.\n", "    \"\"\"\n", "    return Tokenizer(inputCol=input_col, outputCol=output_col).transform(df)" ] }, { "cell_type": "code", "execution_count": 7, "id": "9605eb57", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+\n", "| author| content| description| published_at| source| title| url| url_to_image|label| words|\n", "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+\n", "|bbcnews|Thirty years afte...|The BBC's Fergal ...|2023-12-03 22:21:04|[bbc-news, BBC News]|South Africa: The...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, bbc's, ferg...|\n", "|bbcnews|South Florida. 
A ...|The Grand Theft A...|2023-12-10 00:56:54|[bbc-news, BBC News]|Grand Theft Auto ...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, grand, thef...|\n", "|bbcnews|The families of t...|Police in Vermont...|2023-11-27 00:31:15|[bbc-news, BBC News]|Vermont: Three Pa...|https://www.bbc.c...|https://ichef.bbc...| 0|[police, in, verm...|\n", "|bbcnews|Homosexuality sho...|Cardinal Peter Tu...|2023-11-27 12:59:02|[bbc-news, BBC News]|Ghana Cardinal Pe...|https://www.bbc.c...|https://ichef.bbc...| 0|[cardinal, peter,...|\n", "|bbcnews|A key political a...|A two-year-old ta...|2023-11-23 06:00:31|[bbc-news, BBC News]|Missing Ukrainian...|https://www.bbc.c...|https://ichef.bbc...| 0|[a, two-year-old,...|\n", "|bbcnews|At the end of the...|Even when the war...|2023-12-09 09:48:23|[bbc-news, BBC News]|Israel-Gaza: The ...|https://www.bbc.c...|https://ichef.bbc...| 0|[even, when, the,...|\n", "|bbcnews|A manhunt is unde...|A manhunt is unde...|2023-12-11 13:10:39|[bbc-news, BBC News]|Switzerland manhu...|https://www.bbc.c...|https://ichef.bbc...| 0|[a, manhunt, is, ...|\n", "|bbcnews|It's all rather s...|Karin Kneissl dan...|2023-12-07 06:01:28|[bbc-news, BBC News]|Karin Kneissl, th...|https://www.bbc.c...|https://ichef.bbc...| 0|[karin, kneissl, ...|\n", "|bbcnews|Russian President...|The Russian leade...|2023-12-06 12:56:45|[bbc-news, BBC News]|Putin lands in UA...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, russian, le...|\n", "|bbcnews|Serbians are voti...|The ruling party ...|2023-12-17 13:17:45|[bbc-news, BBC News]|Serbians head to ...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, ruling, par...|\n", "|bbcnews|Police in Mexico ...|The victims were ...|2023-12-18 12:51:22|[bbc-news, BBC News]|Mexico violence: ...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, victims, we...|\n", "|bbcnews|A homeless man mo...|When a drifter mo...|2023-11-26 01:11:32|[bbc-news, BBC News]|The homeless hand...|https://www.bbc.c...|https://ichef.bbc...| 0|[when, a, 
drifter...|\n", "|bbcnews|It was, perhaps, ...|One man fined €40...|2023-11-27 17:41:30|[bbc-news, BBC News]|Slovenia Covid: T...|https://www.bbc.c...|https://ichef.bbc...| 0|[one, man, fined,...|\n", "|bbcnews|Ukraine has launc...|Kyiv investigates...|2023-12-03 16:03:56|[bbc-news, BBC News]|Ukraine war: Russ...|https://www.bbc.c...|https://ichef.bbc...| 0|[kyiv, investigat...|\n", "|bbcnews|When New Zealand'...|The model for an ...|2023-12-11 00:26:33|[bbc-news, BBC News]|New Zealand smoki...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, model, for,...|\n", "|Unknown|Students are bein...|Doctors have been...|2023-11-20 06:15:13|[bbc-news, BBC News]|Knife crime: 'We'...|https://www.bbc.c...|https://ichef.bbc...| 0|[doctors, have, b...|\n", "|bbcnews|The murder of a y...|A vigil for healt...|2023-12-06 14:53:29|[bbc-news, BBC News]|Lurgan murder: Od...|https://www.bbc.c...|https://ichef.bbc...| 0|[a, vigil, for, h...|\n", "|bbcnews|Super-strength st...|Synthetic opioids...|2023-12-11 00:44:28|[bbc-news, BBC News]|Street drugs stro...|https://www.bbc.c...|https://ichef.bbc...| 1|[synthetic, opioi...|\n", "|bbcnews|Michelle Mone say...|PPE Medpro is bei...|2023-12-10 13:15:27|[bbc-news, BBC News]|Tory peer Michell...|https://www.bbc.c...|https://ichef.bbc...| 1|[ppe, medpro, is,...|\n", "|bbcnews|All 221 Republica...|Republicans retur...|2023-12-15 18:47:13|[bbc-news, BBC News]|Biden impeachment...|https://www.bbc.c...|https://ichef.bbc...| 0|[republicans, ret...|\n", "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "tokenizer = Tokenizer(inputCol=\"description\", outputCol=\"words\")\n", "tokenized_data = tokenize_text(labeled_data)\n", "tokenized_data.show()" ] }, { "cell_type": "markdown", "id": "dd9ea063", "metadata": {}, "source": [ "### Define the StopWordsRemover 
and Create the \"filteredWords\" column" ] }, { "cell_type": "code", "execution_count": 8, "id": "38decca9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+\n", "| author| content| description| published_at| source| title| url| url_to_image|label| words| filteredWords|\n", "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+\n", "|bbcnews|Thirty years afte...|The BBC's Fergal ...|2023-12-03 22:21:04|[bbc-news, BBC News]|South Africa: The...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, bbc's, ferg...|[bbc's, fergal, k...|\n", "|bbcnews|South Florida. A ...|The Grand Theft A...|2023-12-10 00:56:54|[bbc-news, BBC News]|Grand Theft Auto ...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, grand, thef...|[grand, theft, au...|\n", "|bbcnews|The families of t...|Police in Vermont...|2023-11-27 00:31:15|[bbc-news, BBC News]|Vermont: Three Pa...|https://www.bbc.c...|https://ichef.bbc...| 0|[police, in, verm...|[police, vermont,...|\n", "|bbcnews|Homosexuality sho...|Cardinal Peter Tu...|2023-11-27 12:59:02|[bbc-news, BBC News]|Ghana Cardinal Pe...|https://www.bbc.c...|https://ichef.bbc...| 0|[cardinal, peter,...|[cardinal, peter,...|\n", "|bbcnews|A key political a...|A two-year-old ta...|2023-11-23 06:00:31|[bbc-news, BBC News]|Missing Ukrainian...|https://www.bbc.c...|https://ichef.bbc...| 0|[a, two-year-old,...|[two-year-old, ta...|\n", "|bbcnews|At the end of the...|Even when the war...|2023-12-09 09:48:23|[bbc-news, BBC News]|Israel-Gaza: The ...|https://www.bbc.c...|https://ichef.bbc...| 0|[even, when, the,...|[even, war, israe...|\n", "|bbcnews|A manhunt is unde...|A 
manhunt is unde...|2023-12-11 13:10:39|[bbc-news, BBC News]|Switzerland manhu...|https://www.bbc.c...|https://ichef.bbc...| 0|[a, manhunt, is, ...|[manhunt, way, su...|\n", "|bbcnews|It's all rather s...|Karin Kneissl dan...|2023-12-07 06:01:28|[bbc-news, BBC News]|Karin Kneissl, th...|https://www.bbc.c...|https://ichef.bbc...| 0|[karin, kneissl, ...|[karin, kneissl, ...|\n", "|bbcnews|Russian President...|The Russian leade...|2023-12-06 12:56:45|[bbc-news, BBC News]|Putin lands in UA...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, russian, le...|[russian, leader,...|\n", "|bbcnews|Serbians are voti...|The ruling party ...|2023-12-17 13:17:45|[bbc-news, BBC News]|Serbians head to ...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, ruling, par...|[ruling, party, f...|\n", "|bbcnews|Police in Mexico ...|The victims were ...|2023-12-18 12:51:22|[bbc-news, BBC News]|Mexico violence: ...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, victims, we...|[victims, young, ...|\n", "|bbcnews|A homeless man mo...|When a drifter mo...|2023-11-26 01:11:32|[bbc-news, BBC News]|The homeless hand...|https://www.bbc.c...|https://ichef.bbc...| 0|[when, a, drifter...|[drifter, moves, ...|\n", "|bbcnews|It was, perhaps, ...|One man fined €40...|2023-11-27 17:41:30|[bbc-news, BBC News]|Slovenia Covid: T...|https://www.bbc.c...|https://ichef.bbc...| 0|[one, man, fined,...|[one, man, fined,...|\n", "|bbcnews|Ukraine has launc...|Kyiv investigates...|2023-12-03 16:03:56|[bbc-news, BBC News]|Ukraine war: Russ...|https://www.bbc.c...|https://ichef.bbc...| 0|[kyiv, investigat...|[kyiv, investigat...|\n", "|bbcnews|When New Zealand'...|The model for an ...|2023-12-11 00:26:33|[bbc-news, BBC News]|New Zealand smoki...|https://www.bbc.c...|https://ichef.bbc...| 0|[the, model, for,...|[model, end, toba...|\n", "|Unknown|Students are bein...|Doctors have been...|2023-11-20 06:15:13|[bbc-news, BBC News]|Knife crime: 'We'...|https://www.bbc.c...|https://ichef.bbc...| 0|[doctors, have, 
b...|[doctors, teachin...|\n", "|bbcnews|The murder of a y...|A vigil for healt...|2023-12-06 14:53:29|[bbc-news, BBC News]|Lurgan murder: Od...|https://www.bbc.c...|https://ichef.bbc...| 0|[a, vigil, for, h...|[vigil, healthcar...|\n", "|bbcnews|Super-strength st...|Synthetic opioids...|2023-12-11 00:44:28|[bbc-news, BBC News]|Street drugs stro...|https://www.bbc.c...|https://ichef.bbc...| 1|[synthetic, opioi...|[synthetic, opioi...|\n", "|bbcnews|Michelle Mone say...|PPE Medpro is bei...|2023-12-10 13:15:27|[bbc-news, BBC News]|Tory peer Michell...|https://www.bbc.c...|https://ichef.bbc...| 1|[ppe, medpro, is,...|[ppe, medpro, sue...|\n", "|bbcnews|All 221 Republica...|Republicans retur...|2023-12-15 18:47:13|[bbc-news, BBC News]|Biden impeachment...|https://www.bbc.c...|https://ichef.bbc...| 0|[republicans, ret...|[republicans, ret...|\n", "+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "remover = StopWordsRemover(inputCol=\"words\", outputCol=\"filteredWords\")\n", "filtered_data = remover.transform(tokenized_data)\n", "filtered_data.show()" ] }, { "cell_type": "markdown", "id": "8bbdfa86", "metadata": {}, "source": [ "### Create a Word2Vec model" ] }, { "cell_type": "code", "execution_count": 9, "id": "8829801e", "metadata": {}, "outputs": [], "source": [ "# Explicit seed (same value as the train/test split) so the embedding training starts from the same random state on every run.\n", "word2Vec = Word2Vec(inputCol=\"filteredWords\", outputCol=\"features\", vectorSize=100, minCount=0, seed=12345)" ] }, { "cell_type": "markdown", "id": "02aafcaa", "metadata": {}, "source": [ "# Model Training" ] }, { "cell_type": "markdown", "id": "0a735363", "metadata": {}, "source": [ "### Create a RandomForestClassifier" ] }, { "cell_type": "code", "execution_count": 10, "id": "00a0aa2b", "metadata": {}, "outputs": [], "source": [ "# Explicit seed so the forest's random sampling is repeatable across kernel restarts.\n", "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\", 
numTrees=10, rawPredictionCol=\"rawPrediction\", seed=12345)\n" ] }, { "cell_type": "markdown", "id": "dd99d22b", "metadata": {}, "source": [ "### Create a pipeline with the stages" ] }, { "cell_type": "code", "execution_count": 11, "id": "49b0a36a", "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(stages=[tokenizer, remover, word2Vec, rf])" ] }, { "cell_type": "markdown", "id": "240a8496", "metadata": {}, "source": [ "### Split the data into training and testing sets" ] }, { "cell_type": "code", "execution_count": 12, "id": "5d821413", "metadata": {}, "outputs": [], "source": [ "training_data, test_data = labeled_data.randomSplit([0.7, 0.3], seed=12345)\n" ] }, { "cell_type": "markdown", "id": "1792eeea", "metadata": {}, "source": [ "### Train the model" ] }, { "cell_type": "code", "execution_count": 13, "id": "cf4f760e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "23/12/22 05:55:25 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS\n", "23/12/22 05:55:25 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS\n" ] } ], "source": [ "model = pipeline.fit(training_data)" ] }, { "cell_type": "markdown", "id": "a092e015", "metadata": {}, "source": [ "## Make predictions on the test set" ] }, { "cell_type": "code", "execution_count": 14, "id": "6a862c2d", "metadata": {}, "outputs": [], "source": [ "predictions = model.transform(test_data)" ] }, { "cell_type": "markdown", "id": "946ce7ed", "metadata": {}, "source": [ "## Evaluate the model" ] }, { "cell_type": "code", "execution_count": 15, "id": "51c34a39", "metadata": {}, "outputs": [], "source": [ "evaluator = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" ] }, { "cell_type": "code", "execution_count": 16, "id": "1650a6fc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8806818181818182" ] }, "execution_count": 16, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "accuracy = evaluator.evaluate(predictions)\n", "accuracy" ] }, { "cell_type": "markdown", "id": "c49b44b0", "metadata": {}, "source": [ "# Save the model" ] }, { "cell_type": "code", "execution_count": 17, "id": "1beb353f", "metadata": {}, "outputs": [], "source": [ "from datetime import datetime \n", "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", "save_path=f\"hdfs://namenode:9000/user/spark/saved_models/news_sentiment_multi_classification_model_{timestamp}\"" ] }, { "cell_type": "code", "execution_count": 18, "id": "aa7cbdb3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "23/12/22 05:55:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 96.54% for 7 writers\n", "23/12/22 05:55:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 84.47% for 8 writers\n", "23/12/22 05:55:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 75.08% for 9 writers\n", "23/12/22 05:55:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 67.58% for 10 writers\n", "23/12/22 05:55:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 75.08% for 9 writers\n", "23/12/22 05:55:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 84.47% for 8 writers\n", "23/12/22 05:55:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 96.54% for 7 writers\n", "23/12/22 05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 96.54% for 7 writers\n", "23/12/22 
05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 84.47% for 8 writers\n", "23/12/22 05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 75.08% for 9 writers\n", "23/12/22 05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 67.58% for 10 writers\n", "23/12/22 05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 61.43% for 11 writers\n", "23/12/22 05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 67.58% for 10 writers\n", "23/12/22 05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 75.08% for 9 writers\n", "23/12/22 05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 84.47% for 8 writers\n", "23/12/22 05:55:31 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory\n", "Scaling row group sizes to 96.54% for 7 writers\n" ] } ], "source": [ "model.save(save_path)" ] }, { "cell_type": "markdown", "id": "72dbcc43", "metadata": {}, "source": [ "# Load the model and predict the latest news result" ] }, { "cell_type": "code", "execution_count": 19, "id": "9c651520", "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import PipelineModel" ] }, { "cell_type": "code", "execution_count": 20, "id": "245ce144", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \r" ] } ], "source": [ "loaded_model = PipelineModel.load(save_path)" ] }, { "cell_type": "code", "execution_count": 21, "id": "3fcf5e3e", "metadata": {}, "outputs": [], "source": [ "# Make predictions on new data\n", 
"hdfsPath = \"hdfs://namenode:9000/user/spark/news_data_articles.txt\"\n", "new_data = spark.read.json(hdfsPath)\n", "\n", "# Label the freshly loaded frame (not the earlier `df`) with the same keyword rule used to build the training labels.\n", "# A separate name keeps the original `labeled_data` from the data-preparation step intact.\n", "new_description = lower(new_data[\"description\"])\n", "new_labeled_data = new_data.withColumn(\"label\", when(new_description.contains(\"crime\") |\n", "                                                     new_description.contains(\"murder\") |\n", "                                                     new_description.contains(\"robbery\"), 1).otherwise(0))\n", "new_predictions = loaded_model.transform(new_labeled_data)\n", "\n", "# Evaluate the predictions on new data (if ground truth labels are available)\n", "new_accuracy = evaluator.evaluate(new_predictions)" ] }, { "cell_type": "code", "execution_count": 22, "id": "2e77b736", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9083469721767594" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_accuracy" ] }, { "cell_type": "markdown", "id": "a55f975a", "metadata": {}, "source": [ "# Post Predictions" ] }, { "cell_type": "markdown", "id": "40f161ab", "metadata": {}, "source": [ "### Evaluate the model using relevant metrics and visualization" ] }, { "cell_type": "code", "execution_count": 23, "id": "c52e90c0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Precision: 0.0\n", "Recall: 0.0\n", "F1 Score: 0.0\n", "Accuracy: 0.8806818181818182\n" ] } ], "source": [ "from pyspark.mllib.evaluation import MulticlassMetrics\n", "\n", "prediction_and_label = predictions.select(\"prediction\", \"label\").rdd.map(lambda row: (float(row[\"prediction\"]), float(row[\"label\"])))\n", "\n", "# Create MulticlassMetrics object\n", "metrics = MulticlassMetrics(prediction_and_label)\n", "\n", "# Overall statistics\n", "print(f\"Precision: {metrics.precision(1.0)}\")\n", "print(f\"Recall: {metrics.recall(1.0)}\") \n", "print(f\"F1 Score: {metrics.fMeasure(1.0)}\") \n", "print(f\"Accuracy: {metrics.accuracy}\")\n", "\n", "\n", "# from pyspark.mllib.evaluation import BinaryClassificationMetrics\n", "# prediction_and_label = 
predictions.select(\"rawPrediction\", \"label\").rdd.map(lambda row: (float(row[\"rawPrediction\"][1]), float(row[\"label\"])))\n", "# metrics = BinaryClassificationMetrics(predictions_and_labels)\n", "# print(f\"Area under ROC: {metrics.areaUnderROC}\")\n", "# evaluator = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" ] }, { "cell_type": "code", "execution_count": 24, "id": "3c703d40", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.9/dist-packages (3.8.2)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (1.2.0)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (4.47.0)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (1.4.5)\n", "Requirement already satisfied: numpy<2,>=1.21 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (1.26.2)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (23.2)\n", "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (10.1.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (3.1.1)\n", "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (2.8.2)\n", "Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib) (6.1.1)\n", "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib) (3.17.0)\n", 
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0mRequirement already satisfied: seaborn in /usr/local/lib/python3.9/dist-packages (0.13.0)\n", "Requirement already satisfied: numpy!=1.24.0,>=1.20 in /usr/local/lib/python3.9/dist-packages (from seaborn) (1.26.2)\n", "Requirement already satisfied: pandas>=1.2 in /usr/local/lib/python3.9/dist-packages (from seaborn) (2.1.4)\n", "Requirement already satisfied: matplotlib!=3.6.1,>=3.3 in /usr/local/lib/python3.9/dist-packages (from seaborn) (3.8.2)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (1.2.0)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (4.47.0)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (1.4.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (23.2)\n", "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (10.1.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (3.1.1)\n", "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (2.8.2)\n", 
"Requirement already satisfied: importlib-resources>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (6.1.1)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=1.2->seaborn) (2023.3.post1)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=1.2->seaborn) (2023.3)\n", "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.9/dist-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.3->seaborn) (3.17.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.3->seaborn) (1.16.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0mRequirement already satisfied: scikit-learn in /usr/local/lib/python3.9/dist-packages (1.3.2)\n", "Requirement already satisfied: numpy<2.0,>=1.17.3 in /usr/local/lib/python3.9/dist-packages (from scikit-learn) (1.26.2)\n", "Requirement already satisfied: scipy>=1.5.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn) (1.11.4)\n", "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn) (1.3.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn) (3.2.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!pip install matplotlib\n", "!pip install seaborn\n", "!pip install scikit-learn\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.metrics import confusion_matrix,roc_curve" ] }, { "cell_type": "markdown", "id": "c853d69c", "metadata": {}, "source": [ "### Confusion Matrix\n", " Examine the confusion matrix to understand the distribution of true positives, true negatives, false positives, and false negatives." ] }, { "cell_type": "code", "execution_count": 25, "id": "c36577f2", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "conf_matrix = confusion_matrix(predictions.select(\"label\").collect(), predictions.select(\"prediction\").collect())\n", "sns.heatmap(conf_matrix, annot=True, fmt=\"d\", cmap=\"Blues\")\n", "plt.xlabel(\"Predicted\")\n", "plt.ylabel(\"Actual\")\n", "plt.title(\"Confusion Matrix\")\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "bc98f62c", "metadata": {}, "source": [ "### Interpretation\n", "inspect feature importance scores to understand which features contribute most to predictions." ] }, { "cell_type": "code", "execution_count": 26, "id": "9ae56246", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Feature Importances:\n", "Feature 1: 0.015051080097373496\n", "Feature 2: 0.0002873794110582124\n", "Feature 3: 0.01332814031334399\n", "Feature 4: 0.0\n", "Feature 5: 0.0\n", "Feature 6: 0.0\n", "Feature 7: 0.019113264004936917\n", "Feature 8: 0.015064460075717984\n", "Feature 9: 0.014789811418442484\n", "Feature 10: 0.012263786258543456\n", "Feature 11: 0.0036664835219485485\n", "Feature 12: 0.0\n", "Feature 13: 0.010282457713879428\n", "Feature 14: 0.0\n", "Feature 15: 0.0037138960905618843\n", "Feature 16: 0.0\n", "Feature 17: 0.005921216051342099\n", "Feature 18: 0.015603721533278981\n", "Feature 19: 0.00433138753162371\n", "Feature 20: 0.0\n", "Feature 21: 0.0\n", "Feature 22: 0.017442331773253494\n", "Feature 23: 0.0\n", "Feature 24: 0.017086010558358226\n", "Feature 25: 0.00382297876028498\n", "Feature 26: 0.0\n", "Feature 27: 0.0\n", "Feature 28: 0.0\n", "Feature 29: 0.009828777231717087\n", "Feature 30: 0.0054620640660725285\n", "Feature 31: 0.0071588129266852655\n", "Feature 32: 0.019787704511670483\n", "Feature 33: 0.011630586259748912\n", "Feature 34: 0.0\n", "Feature 35: 0.0123551217832904\n", "Feature 36: 0.007273668455883249\n", "Feature 37: 0.0\n", "Feature 38: 0.02704607231853859\n", "Feature 39: 0.0\n", "Feature 40: 
0.016488121681052556\n", "Feature 41: 0.01134731455210528\n", "Feature 42: 0.00441525866442517\n", "Feature 43: 0.00561919940965798\n", "Feature 44: 0.029141144522203323\n", "Feature 45: 0.0007150276262125509\n", "Feature 46: 0.015891176830203916\n", "Feature 47: 0.00883907430994379\n", "Feature 48: 0.0053535848334402055\n", "Feature 49: 0.0\n", "Feature 50: 0.0\n", "Feature 51: 0.0\n", "Feature 52: 0.018040508395540052\n", "Feature 53: 0.027209352303473152\n", "Feature 54: 0.020822936370520597\n", "Feature 55: 0.010537910297560061\n", "Feature 56: 0.017596989223655535\n", "Feature 57: 0.0\n", "Feature 58: 0.015309752835537396\n", "Feature 59: 0.0\n", "Feature 60: 0.04808339226472713\n", "Feature 61: 0.011003456535374823\n", "Feature 62: 0.008287661643170411\n", "Feature 63: 0.0\n", "Feature 64: 0.02627842644074586\n", "Feature 65: 0.018857009933697797\n", "Feature 66: 0.00021182520079705128\n", "Feature 67: 0.0304997344723786\n", "Feature 68: 0.0750252125596929\n", "Feature 69: 0.0\n", "Feature 70: 0.005647297261208397\n", "Feature 71: 0.020748097344815527\n", "Feature 72: 0.0\n", "Feature 73: 0.0\n", "Feature 74: 0.011137706974026758\n", "Feature 75: 0.01805635237077599\n", "Feature 76: 0.0\n", "Feature 77: 0.011212527737242146\n", "Feature 78: 0.0\n", "Feature 79: 0.03746679715442337\n", "Feature 80: 0.024097522808354022\n", "Feature 81: 0.01072801718424011\n", "Feature 82: 0.0\n", "Feature 83: 0.0037822651354894854\n", "Feature 84: 0.022589784097022714\n", "Feature 85: 0.006621122336113426\n", "Feature 86: 0.0030245984615245207\n", "Feature 87: 0.004278489829398828\n", "Feature 88: 0.017400495058326658\n", "Feature 89: 0.004732734467445651\n", "Feature 90: 0.001966535322976351\n", "Feature 91: 0.0\n", "Feature 92: 0.02625476239261398\n", "Feature 93: 0.00685432043116109\n", "Feature 94: 0.0\n", "Feature 95: 0.029561781984546637\n", "Feature 96: 0.0\n", "Feature 97: 0.02352814462359449\n", "Feature 98: 0.006647713461139032\n", "Feature 99: 0.0\n", "Feature 100: 
0.005777649989890048\n" ] } ], "source": [ "# Importances exposed by the last pipeline stage, converted from a Spark vector to an array.\n", "last_stage = model.stages[-1]\n", "feature_importances = last_stage.featureImportances.toArray()\n", "\n", "print(\"Feature Importances:\")\n", "# Number the features from 1 so the printout is 1-based.\n", "for feature_no, weight in enumerate(feature_importances, start=1):\n", "    print(f\"Feature {feature_no}: {weight}\")" ] }, { "cell_type": "markdown", "id": "a18a8b0e", "metadata": {}, "source": [ "### Visualize" ] }, { "cell_type": "code", "execution_count": 27, "id": "425705dc", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "prediction_and_label = predictions.select(\"rawPrediction\", \"label\").rdd.map(lambda row: (float(row[\"rawPrediction\"][1]), float(row[\"label\"])))\n", "\n", "\n", "prediction_and_label_list = prediction_and_label.collect()\n", "\n", "# Extract probabilities and labels\n", "probabilities, labels = zip(*prediction_and_label_list)\n", "\n", "# Calculate false positive rate (fpr) and true positive rate (tpr)\n", "fpr, tpr, _ = roc_curve(labels, probabilities)\n", "\n", "# Plot ROC curve\n", "plt.plot(fpr, tpr, label='ROC Curve')\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", "plt.title('ROC Curve')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 28, "id": "305c496b", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "probabilities = predictions.select(\"rawPrediction\").rdd.map(lambda row: float(row[\"rawPrediction\"][1])).collect()\n", "\n", "# Plot a histogram\n", "plt.hist(probabilities, bins=50, density=True, alpha=0.5, color='b')\n", "plt.xlabel('Predicted Probability for Class 1')\n", "plt.ylabel('Density')\n", "plt.title('Prediction Distribution')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 29, "id": "f58e4a2c", "metadata": {}, "outputs": [], "source": [ "spark.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }