{ "cells": [ { "cell_type": "markdown", "id": "3ccaa5aa", "metadata": {}, "source": [ "# News Sentiment Analysis: ETL Pipeline Using Kafka, Hadoop, and Spark" ] }, { "cell_type": "code", "execution_count": 1, "id": "cbde3506", "metadata": {}, "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", "from pyspark.ml.classification import RandomForestClassifier\n", "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", "from pyspark.ml.feature import StopWordsRemover, Tokenizer, Word2Vec\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import lower, when\n", "import json" ] }, { "cell_type": "code", "execution_count": 2, "id": "03ae1824", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "23/12/22 05:55:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" ] }, { "data": { "text/html": [ "\n", "
SparkSession - in-memory
\n", " \n", "SparkContext
\n", "\n", " \n", "\n", "v3.0.0
local[*]
News Sentiment analysis