Data Science
This is an ETL project I built on Google Cloud Platform.
The data is dummy data generated with Python.
import csv
from faker import Faker
import pandas as pd
fake = Faker()
def sanitize_text(text):
    """Return *text* flattened to a single line that is safe for a CSV field.

    Commas are replaced with spaces, and every run of whitespace (including
    newlines and carriage returns) is collapsed to a single space. Leading
    and trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if *text* is empty or blank.
    """
    # Replacing "," first matters: ", " would otherwise become two spaces.
    # split() with no arguments splits on any whitespace run and drops the
    # empty edges, so joining with " " removes the extra spaces as promised.
    return " ".join(text.replace(",", " ").split())
def generate_employee_data(num_employees=1000):
with open("cleaned_employee_data.csv", "w", newline="", encoding="utf-8") as csvfile:
fieldnames = [
"employee_id", "first_name", "last_name", "email", "phone_number", "address", "birthdate",
"hire_date", "job_title", "department", "salary", "password"
]
This is a project I did to predict whether or not a patient has hepatitis, based on their laboratory parameters.
Dataset contains laboratory values of blood donors and Hepatitis C patients and demographic values like age.
!pip install pyspark
# Load the Spark packages
from pyspark import SparkContext
# SparkSession is used below, so it has to be imported as well;
# without this import the next statement raises NameError.
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this application
spark = SparkSession.builder.appName("MLwithSpark").getOrCreate()
# Load the dataset: the first row is used as the header and column types are inferred
df = spark.read.csv("/content/drive/MyDrive/Colab Notebooks/Data/hcvdata.csv",header=True,inferSchema=True)
# NOTE(review): vec_df is not defined in the visible code; presumably it is
# built from df in a step that is not shown here. Confirm before running.
# A fixed seed makes the 70/30 split reproducible from run to run.
train_df,test_df = vec_df.randomSplit([0.7,0.3], seed=42)
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier

# Logistic regression: 'features' is the input column, 'Target' the label column
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Target',
)
lr_model = lr.fit(train_df)

# Apply the fitted model to test_df and display the result
y_pred = lr_model.transform(test_df)
y_pred.show()

# Display only the label next to the model outputs
y_pred.select(
    'target',
    'rawPrediction',
    'probability',
    'prediction',
).show()
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the predictions with the 'accuracy' metric against the 'Target' label column
multi_evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Target',
)
multi_evaluator.evaluate(y_pred)
from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics expects an RDD of (prediction, label) pairs, in that order.
# The columns were previously passed as (target, prediction), which swaps the
# roles of prediction and label and therefore swaps precision with recall.
# NOTE(review): the label is cast to double because MulticlassMetrics is
# documented as working on double-valued pairs; confirm the column type of
# 'target' in the real data.
prediction_and_label = y_pred.selectExpr('prediction', 'cast(target as double) as label')
lr_metric = MulticlassMetrics(prediction_and_label.rdd)

# Overall accuracy, then precision / recall / F-measure for label 1.0
print("Accuracy",lr_metric.accuracy)
print("Precision",lr_metric.precision(1.0))
print("Recall",lr_metric.recall(1.0))
print("F1Score",lr_metric.fMeasure(1.0))
—
This is a project I did for Sentiment Analysis on X (Twitter).
The data was obtained from X through its API. (Sentiment analysis with data from Twitter:)
# Build a tweepy OAuth handler from the consumer key/secret, attach the
# access token/secret to it, and create the API client from that handler.
# NOTE(review): consumerKey, consumerSecret, accessToken and accessTokenSecret
# are not defined in the visible code; presumably they are set in a step that
# is not shown here.
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)
# Build the analyzer once: it does not depend on the tweet, so constructing it
# inside the loop repeated the same setup work for every tweet.
sentiment_analyzer = SentimentIntensityAnalyzer()

# NOTE(review): tweets, tweet_list and polarity are not defined in the visible
# code; presumably they are initialised in a step that is not shown here.
for tweet in tweets:
    # Keep the raw text of every tweet
    tweet_list.append(tweet.text)

    # Score the same text with both TextBlob and the intensity analyzer
    analysis = TextBlob(tweet.text)
    score = sentiment_analyzer.polarity_scores(tweet.text)
    neg = score['neg']
    neu = score['neu']
    pos = score['pos']
    comp = score['compound']

    # Running total of the TextBlob polarity across all tweets
    polarity += analysis.sentiment.polarity
# Split tw_list into one subset per value of its "sentiment" column
sentiment_labels = tw_list["sentiment"]
tw_list_negative = tw_list[sentiment_labels == "negative"]
tw_list_positive = tw_list[sentiment_labels == "positive"]
tw_list_neutral = tw_list[sentiment_labels == "neutral"]