In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3
In [20]:
# =======================================================
# Laden der Daten aus SQLite-Datenbank
# -- Verbindung zur Datenbank und Abfrage der Reviews-Tabelle
# =======================================================
In [21]:
con = sqlite3.connect(r'C:\Users\Miso\Desktop\Data Analysis 3/database.sqlite')
In [22]:
type(con)
Out[22]:
sqlite3.Connection
In [23]:
df=pd.read_sql_query('SELECT * FROM Reviews', con)
In [24]:
df.head()
Out[24]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... |
1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... |
2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... |
3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... |
In [25]:
df.shape
Out[25]:
(568454, 10)
In [26]:
pd.read_csv(r'C:\Users\Miso\Desktop\Data Analysis 3\Reviews.csv')
Out[26]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... |
1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... |
2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... |
3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
568449 | 568450 | B001EO7N10 | A28KG5XORO54AY | Lettie D. Carter | 0 | 0 | 5 | 1299628800 | Will not do without | Great for sesame chicken..this is a good if no... |
568450 | 568451 | B003S1WTCU | A3I8AFVPEE8KI5 | R. Sawyer | 0 | 0 | 2 | 1331251200 | disappointed | I'm disappointed with the flavor. The chocolat... |
568451 | 568452 | B004I613EE | A121AA1GQV751Z | pksd "pk_007" | 2 | 2 | 5 | 1329782400 | Perfect for our maltipoo | These stars are small, so you can give 10-15 o... |
568452 | 568453 | B004I613EE | A3IBEVCTXKNOH | Kathy A. Welch "katwel" | 1 | 1 | 5 | 1331596800 | Favorite Training and reward treat | These are the BEST treats for training and rew... |
568453 | 568454 | B001LR2CU2 | A3LGQPJCZVL9UC | srfell17 | 0 | 0 | 5 | 1338422400 | Great Honey | I am very satisfied ,product is as advertised,... |
568454 rows × 10 columns
In [27]:
!pip install textblob
from textblob import TextBlob
Requirement already satisfied: textblob in c:\users\miso\anaconda3\lib\site-packages (0.19.0) Requirement already satisfied: nltk>=3.9 in c:\users\miso\anaconda3\lib\site-packages (from textblob) (3.9.1) Requirement already satisfied: click in c:\users\miso\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (8.1.8) Requirement already satisfied: joblib in c:\users\miso\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (1.4.2) Requirement already satisfied: regex>=2021.8.3 in c:\users\miso\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (2024.11.6) Requirement already satisfied: tqdm in c:\users\miso\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (4.67.1) Requirement already satisfied: colorama in c:\users\miso\anaconda3\lib\site-packages (from click->nltk>=3.9->textblob) (0.4.6)
In [28]:
text=df['Summary'][0]
text
Out[28]:
'Good Quality Dog Food'
In [29]:
TextBlob(text).sentiment.polarity
Out[29]:
0.7
In [30]:
# =======================================================
# Sentimentanalyse auf Zusammenfassungen (Summary)
# -- Berechnung der Polarität für jede Zusammenfassung
# =======================================================
In [31]:
polarity=[]
for i in df['Summary']:
try:
polarity.append(TextBlob(i).sentiment.polarity)
except:
polarity.append(0)
In [32]:
len(polarity)
Out[32]:
568454
In [33]:
# =======================================================
# Hinzufügen der Polarität zum DataFrame
# =======================================================
In [34]:
data=df.copy()
In [35]:
data['Polarity']=polarity
In [36]:
data.head()
Out[36]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | Polarity | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... | 0.7 |
1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... | 0.0 |
2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... | 0.0 |
3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... | 0.0 |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... | 0.8 |
In [37]:
# =======================================================
# Positive Zusammenfassungen extrahieren
# =======================================================
In [38]:
data_positive=data[data['Polarity']>0]
In [39]:
data_positive.shape
Out[39]:
(331661, 11)
In [40]:
# =======================================================
# Wordcloud für positive Zusammenfassungen
# =======================================================
In [41]:
!pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
Requirement already satisfied: wordcloud in c:\users\miso\anaconda3\lib\site-packages (1.9.4) Requirement already satisfied: numpy>=1.6.1 in c:\users\miso\anaconda3\lib\site-packages (from wordcloud) (2.1.3) Requirement already satisfied: pillow in c:\users\miso\anaconda3\lib\site-packages (from wordcloud) (11.1.0) Requirement already satisfied: matplotlib in c:\users\miso\anaconda3\lib\site-packages (from wordcloud) (3.10.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\miso\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.3.1) Requirement already satisfied: cycler>=0.10 in c:\users\miso\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\miso\anaconda3\lib\site-packages (from matplotlib->wordcloud) (4.55.3) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\miso\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.4.8) Requirement already satisfied: packaging>=20.0 in c:\users\miso\anaconda3\lib\site-packages (from matplotlib->wordcloud) (24.2) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\miso\anaconda3\lib\site-packages (from matplotlib->wordcloud) (3.2.0) Requirement already satisfied: python-dateutil>=2.7 in c:\users\miso\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.9.0.post0) Requirement already satisfied: six>=1.5 in c:\users\miso\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.17.0)
In [42]:
stopwords=set(STOPWORDS)
In [43]:
data_positive.head()
Out[43]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | Polarity | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... | 0.700000 |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... | 0.800000 |
5 | 6 | B006K2ZZ7K | ADT0SRK1MGOEU | Twoapennything | 0 | 0 | 4 | 1342051200 | Nice Taffy | I got a wild hair for taffy and ordered this f... | 0.600000 |
6 | 7 | B006K2ZZ7K | A1SP2KVKFXXRU1 | David C. Sullivan | 0 | 0 | 5 | 1340150400 | Great! Just as good as the expensive brands! | This saltwater taffy had great flavors and was... | 0.358333 |
7 | 8 | B006K2ZZ7K | A3JRGQVEQN31IQ | Pamela G. Williams | 0 | 0 | 5 | 1336003200 | Wonderful, tasty taffy | This taffy is so good. It is very soft and ch... | 1.000000 |
In [44]:
total_text=(' '.join(data_positive['Summary']))
In [45]:
len(total_text)
Out[45]:
8464853
In [46]:
total_text[0:1000]
Out[46]:
'Good Quality Dog Food Great taffy Nice Taffy Great! Just as good as the expensive brands! Wonderful, tasty taffy Healthy Dog Food The Best Hot Sauce in the World My cats LOVE this "diet" food better than their regular food My Cats Are Not Fans of the New Food fresh and greasy! Love it! GREAT SWEET CANDY! Always fresh Delicious product! Great Bargain for the Price The Best Hot Sauce in the World Great machine! Best of the Instant Oatmeals Good Instant Great Irish oatmeal for those in a hurry! satisfying Love Gluten Free Oatmeal!!! GOOD WAY TO START THE DAY.... Wife\'s favorite Breakfast Why wouldn\'t you buy oatmeal from Mcanns? Tastes great! Good Hot Breakfast Great taste and convenience good Very good but next time I won\'t order the Variety Pack HOT! And good! Came back for more :) Roasts up a smooth brew Our guests love it! Awesome Deal! Awsome - Kids in neighborhood loved us! great deal. Better price for this at Target great source of electrolytes Great for preventing cramps Taste'
In [47]:
import re
total_text=re.sub('[^a-zA-Z]',' ',total_text)
In [48]:
total_text[0:2000]
Out[48]:
'Good Quality Dog Food Great taffy Nice Taffy Great Just as good as the expensive brands Wonderful tasty taffy Healthy Dog Food The Best Hot Sauce in the World My cats LOVE this diet food better than their regular food My Cats Are Not Fans of the New Food fresh and greasy Love it GREAT SWEET CANDY Always fresh Delicious product Great Bargain for the Price The Best Hot Sauce in the World Great machine Best of the Instant Oatmeals Good Instant Great Irish oatmeal for those in a hurry satisfying Love Gluten Free Oatmeal GOOD WAY TO START THE DAY Wife s favorite Breakfast Why wouldn t you buy oatmeal from Mcanns Tastes great Good Hot Breakfast Great taste and convenience good Very good but next time I won t order the Variety Pack HOT And good Came back for more Roasts up a smooth brew Our guests love it Awesome Deal Awsome Kids in neighborhood loved us great deal Better price for this at Target great source of electrolytes Great for preventing cramps Taste is not so good How much would you pay for a bag of chocolate pretzels Great Gummi Best ever latice tart nothing special Good Taste great this is the best Delicious Great Natural Balance Lamb and Rice Great food Great for my dogs allergies Great for stomach problems Better life for you dog Great Food Great food for my my dog who has a sensitive stomach Great dog food Mmmmm Mmmmm good Great Dog Food Good healthy dog food Great dog food Great allergy sensitive dog food dogs love it Perfect for our English Bulldog with Allergies Taste wise it is a star item Great Support Loved these Tartlets The best My Idea of a Good Diet Food Delicious tea the best tea ever freah bright clean Wonderful Tea Great cookies Best everyday cookie So Far So Good Best Cat Food Great food Perfect Cat Food For Older Cats Good for Feline UTI Palatable and healthy Healthy They LOVE It Wonderful food perfect for allergic kitties Tastes great Love Hot Spicy Bad price here My favorite ra'
In [49]:
total_text=re.sub(' +',' ', total_text)
In [50]:
total_text[0:10000]
Out[50]:
'Good Quality Dog Food Great taffy Nice Taffy Great Just as good as the expensive brands Wonderful tasty taffy Healthy Dog Food The Best Hot Sauce in the World My cats LOVE this diet food better than their regular food My Cats Are Not Fans of the New Food fresh and greasy Love it GREAT SWEET CANDY Always fresh Delicious product Great Bargain for the Price The Best Hot Sauce in the World Great machine Best of the Instant Oatmeals Good Instant Great Irish oatmeal for those in a hurry satisfying Love Gluten Free Oatmeal GOOD WAY TO START THE DAY Wife s favorite Breakfast Why wouldn t you buy oatmeal from Mcanns Tastes great Good Hot Breakfast Great taste and convenience good Very good but next time I won t order the Variety Pack HOT And good Came back for more Roasts up a smooth brew Our guests love it Awesome Deal Awsome Kids in neighborhood loved us great deal Better price for this at Target great source of electrolytes Great for preventing cramps Taste is not so good How much would you pay for a bag of chocolate pretzels Great Gummi Best ever latice tart nothing special Good Taste great this is the best Delicious Great Natural Balance Lamb and Rice Great food Great for my dogs allergies Great for stomach problems Better life for you dog Great Food Great food for my my dog who has a sensitive stomach Great dog food Mmmmm Mmmmm good Great Dog Food Good healthy dog food Great dog food Great allergy sensitive dog food dogs love it Perfect for our English Bulldog with Allergies Taste wise it is a star item Great Support Loved these Tartlets The best My Idea of a Good Diet Food Delicious tea the best tea ever freah bright clean Wonderful Tea Great cookies Best everyday cookie So Far So Good Best Cat Food Great food Perfect Cat Food For Older Cats Good for Feline UTI Palatable and healthy Healthy They LOVE It Wonderful food perfect for allergic kitties Tastes great Love Hot Spicy Bad price here My favorite ramen Amazing to the last bite Great spicy flavor Great value and convenient ramen great flavor Tastes great but is cheaper locally Tastes awesome looks beautiful Happy Face Simply the BEST Excellent Product Life Saver Nice snack Good Licorice I love these Great for the kids Sweet with a nice kick Love the salsa awesome cornmeal GREAT marinade Awesome stuff tastes good Great flavor of Jell o Great Deal Great tasting sea salt WITH iodine tastes very fresh Simple but good Not the greatest tasting Not Bad Right size taste This stuff is sooooo good Best Stuff Ever Worked great Delicious Fluffy Soft Delicious and Sugary Sweet Great but not as good as it was back in the day as a teen EXCELLENT LEMON JUICE Great Product Handy Never paid that much Great product to help you sleep Perfect for gluten free chocolate chip cookies Make a fresh fruit tart light and beautiful not bad for instant healthy coffee It s ok I love it great taste and has health benefits Tastes Great Arrived in days Great for after lunch Nice little mints but pricey These mints are awesome Love these And reusable containers A huge hit at the office Love em they re great Love these fresh better than average more expensive than average Great For Fat Cats and Senior Citizens Best by the case More Hot Spicy than McCormick s Brand Ahmad Loose Imperial Blend Tea is great for the price Nice tea Best Ahmad Tea My favorite tea Best tea ever DELICIOUS Best Bloody Mary mixer The Best Love this tea Really Nice Taste High Quality But it gave my dog wicked gas Great tasting green tea and such a great deal OMG best chocolate jelly belly Excellent loose tea Good anytime hot tea Wonderful Best way to buy kcups delicious Super SuperFoods are Super easy Best Energy Shot For Me Great for Gluten free lifestyle Excellent but not perfect Good product Thanks for the review Scott great Awesome Sugar Great product weak packaging Excellent Excellent for G F Amazing Very tasty chips Excellent Taste it s fabulous but not from amazon Not mild enough for me lol Great Natural Energy Great Energy The best energy shot out there smooth and organic Fantastic natural energy Way better than Guayaki Doesn t taste that good but provides you the energy Favorite energy shot and all natural too natural energy boost Best energy shot I have ever tasted The Best Good Stuff Great energy drink without artificial ingredients Flavor getting better energy is great Fantastic WOW Very Dissapointed Very Good Coffee Very Tasty Excellent coffee Hot Hot and delicious ABSOLUTELY DELICIOUS Great gag gift arrived FAST No broken creamers Shipped great Better Packaging Perfect for work Yes this is real excellent coffee Does not taste very good Love Love Love These great for eating whole foods clean with veggie brush Absolutely LOVE IT Only good for ice Great for teething Wonderful idea difficult to clean I wasn t that impressed Love the Fresh Food Feeder Great Beans Good stuff excellent exactly what I expected These are the Best Love Love Love The product is great but the price is out of line Perfect great taste Excellent Everyday Olive Oil Love Weavers I am a fan Treat yourself to the best coffee Drinking it now love the latin america aroma GREAT SNACK Best Bar My New Granola Bar Another Husband Favorite Very Smooth Coffee Highly Recommended My favorite Good Coffee Greatest Oil since slice bread Best Ever Deliciously scrumptious This is really good stuff Porcini Mushrooms an excellent product Excellent flavor mostly large pieces The Best Good for the money not the highest quality but good for the price Fresh Whole perfect Fresh and Tasty Cat won t go near it Simply WIld Chick Brown RIce for Cats A Great All Around Mix Great mix Perfect mix for egg allergic Arrowhead Mills whole grain buttermilk Pancakes are easy Good for Egg Allergy Great Healthy Snack Sweet and Soothing A Fantastic Healthy Product Great product Excellent tea best roast ever Franch s is the best VERY GOOD Great taste and easy for a single guy Look elsewhere for your whole grains These are Famous for a reason Wow God I love these cookies Fresh Lightly Spiced Crunchy Kettle Chips Good Value Good Product glad to find them in oz size pretty good could be better Best chips ever Kettle potato chips Sweet onion Ridiculously Good Delicious I love these chips They are thick and crunchy Quite good Delicious WOW Best gluten free dairy free chips A unique flavor for fans of Thai food Honey Dijon leaves bad aftertaste NY Cheddar are pretty good Very good Excellent A delicious crisp chip with good flavor BEST BUY in BBQ Chips Love Them Best deal ever Excellent Thai flavored chip Best Kettle Chips Delicious as always Not quite the best One of Their Best Flavors Love these chips Highly addicitive chips These are AWESOME By far my favorite chips Good chips more cheese Pretty good tasting chip Best sour cream onion chip I ve had Great chips Fabulous Great Chip Excellent balance of taste crunchiness and moisture Very good chips at a great price Great chips Good and tangy the best chips ever I do not even like kettle chips and I love these You have to love sea salt and vinegar already amazing chips Best Chip Ever Tangy spicy and sweet oh my The best I ve had Excellent chip Delicious Love Kettle Chips Best unsalted chips Love Kettle Chips but not this flavor Crisp Delicious what else did you expect Great Value I have had better Jalapeno Kettle Chips Spicy but good boulder salt and malt vinegar chips are way better Lightly Salted Heavily Delicious Too Much Flavor Love at first bite Tongue puckering tang and crunch The Best Chips PERIOD Delicious Extra Crunchy Best Salt Vinegar Gourmet powerful Salt Vinegar chips Great deal Best Chips out there Great price but not as tangy as I expected Absotively Posilutely Delicious completely ripped off C H I P C H I P H O O R A Y B A C K Y A R D B A R B E C U E Not the Best THESE ARE VERY GOOD USED to be my favorite chips Not as good as the English sell Not so good delicious Kettle Chips Make Great Mouse Food great hot new flavor Favorite Kettle flavor and a great value Kettle Brand Potato Chips New York Cheddar My favorite flavor So much flavor your farts will smell like sweet onions Great Chip Awesome and delicious Good chips YUM If you want a snack have something REALLY good a good buy Good chips Awesome Great chips with very low sodium My favorite Kettle Chip Best salt vinegar chips out there Amazing Service Pretty tasty and decently spiced Great Tasting Chips Good deal but close expiration date GREAT TASTING CHIPS Buy These Eat These Be Happy Fantastic Sweet salty tangy the way a snack should be Some of the best chips anywhere Delicious Lightly salted yet tasty Crunch Wow Great strong flavor Best Chips Out There These chips tasted good awesome chips Delicious Tangy and delicious snack Best Chip Best Chips I ve Ever Tasted Love the smaller bags Ok but Miss Vickie s Are Better These chips are awesome if not best but GREAT DEAL What a great tea at this price delicious Great well balanced Earl Grey Best Earl Grey ever favorite Earl Grey tea Delicious The best A HUGE Success Do not taste from bottle Mix with vanilla for true flavor The oldest soft drink is still the best Caramel flavor excellent for baking and toppings tips for using agave too Great buy excellent sweetner Good but container could be better Great stuff Healthy Sweetener Great way replacing the sugar Great substitute sweetener The Best Healthy Stuff Sweet success great product Best price on agave nectar that I ve found How this could be good Best tea I ever had Who needs salsa when chips taste this good Delicious Organic yummy chips what more can you ask for These are the best widely available bbq chips My favorite chips from Kettle Amazing Taste Best Chip Ever Plocky s Sweet Smokey Chipotle Whole Grain Tortill The best tortilla chips I have ever eaten Such an excellent chip Great Huge fan of these chips Plocky s tortilla chips tasty and healthy Tasty but make sure you have gum delicious and healthy Unique schrumshist and tasty Tortilla Chips Best kept secret Delicious'
In [51]:
# =======================================================
# Negative Zusammenfassungen analysieren & Wordcloud
# =======================================================
In [52]:
wordcloud=WordCloud(width=1000,height=500,stopwords=stopwords).generate(total_text)
plt.figure(figsize=(15,5))
plt.imshow(wordcloud)
plt.axis('off')
Out[52]:
(np.float64(-0.5), np.float64(999.5), np.float64(499.5), np.float64(-0.5))
In [53]:
data_negative=data[data['Polarity']<0]
In [54]:
total_text2=(' '.join(data_negative['Summary']))
In [55]:
total_text2=re.sub('[^a-zA-Z]',' ',total_text2)
In [56]:
total_text2=re.sub(' +',' ', total_text2)
In [57]:
wordcloud2=WordCloud(width=1000,height=500,stopwords=stopwords).generate(total_text2)
plt.figure(figsize=(15,5))
plt.imshow(wordcloud2)
plt.axis('off')
Out[57]:
(np.float64(-0.5), np.float64(999.5), np.float64(499.5), np.float64(-0.5))
In [58]:
df.head()
Out[58]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... |
1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... |
2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... |
3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... |
In [59]:
# =======================================================
# Nutzeranalyse – Top-User nach Bewertungen
# =======================================================
In [60]:
df['UserId'].nunique()
Out[60]:
256059
In [61]:
raw=df.groupby('UserId').agg({'Summary':'count','Text':'count','Score':'mean','ProductId':'count'}).sort_values(by='Text',ascending=False)
In [62]:
raw
Out[62]:
Summary | Text | Score | ProductId | |
---|---|---|---|---|
UserId | ||||
A3OXHLG6DIBRW8 | 448 | 448 | 4.535714 | 448 |
A1YUL9PCJR3JTY | 421 | 421 | 4.494062 | 421 |
AY12DBB0U420B | 389 | 389 | 4.647815 | 389 |
A281NPSIMI1C2R | 365 | 365 | 4.841096 | 365 |
A1Z54EM24Y40LL | 256 | 256 | 4.453125 | 256 |
... | ... | ... | ... | ... |
AZZQLMNX239VT | 1 | 1 | 5.000000 | 1 |
AZZP14UZ813US | 1 | 1 | 5.000000 | 1 |
AZZOMF6HZYFL7 | 1 | 1 | 2.000000 | 1 |
AZZV61COVM8CA | 1 | 1 | 5.000000 | 1 |
AZZUQYE2C1LNI | 1 | 1 | 4.000000 | 1 |
256059 rows × 4 columns
In [63]:
raw.columns=['Number_of_summaries','num_text','Avg_Score','no_of_products_purchased']
raw
Out[63]:
Number_of_summaries | num_text | Avg_Score | no_of_products_purchased | |
---|---|---|---|---|
UserId | ||||
A3OXHLG6DIBRW8 | 448 | 448 | 4.535714 | 448 |
A1YUL9PCJR3JTY | 421 | 421 | 4.494062 | 421 |
AY12DBB0U420B | 389 | 389 | 4.647815 | 389 |
A281NPSIMI1C2R | 365 | 365 | 4.841096 | 365 |
A1Z54EM24Y40LL | 256 | 256 | 4.453125 | 256 |
... | ... | ... | ... | ... |
AZZQLMNX239VT | 1 | 1 | 5.000000 | 1 |
AZZP14UZ813US | 1 | 1 | 5.000000 | 1 |
AZZOMF6HZYFL7 | 1 | 1 | 2.000000 | 1 |
AZZV61COVM8CA | 1 | 1 | 5.000000 | 1 |
AZZUQYE2C1LNI | 1 | 1 | 4.000000 | 1 |
256059 rows × 4 columns
In [64]:
# =======================================================
# Visualisierung der 10 aktivsten Nutzer
# =======================================================
In [65]:
user_10=raw.index[0:10]
In [66]:
number_10=raw['no_of_products_purchased'][0:10]
In [67]:
plt.bar(user_10,number_10,label='most recommended users')
plt.xlabel('User_Id')
plt.ylabel('Number of products purchased')
plt.xticks(rotation='vertical')
Out[67]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [Text(0, 0, 'A3OXHLG6DIBRW8'), Text(1, 0, 'A1YUL9PCJR3JTY'), Text(2, 0, 'AY12DBB0U420B'), Text(3, 0, 'A281NPSIMI1C2R'), Text(4, 0, 'A1Z54EM24Y40LL'), Text(5, 0, 'A1TMAVN4CEM8U8'), Text(6, 0, 'A2MUGFV2TDQ47K'), Text(7, 0, 'A3TVZM3ZIXG8YW'), Text(8, 0, 'A3PJZ8TU8FDQ1K'), Text(9, 0, 'AQQLWCMRNDFGI')])
In [68]:
# =======================================================
# Stichprobe und Datenbereinigung
# =======================================================
In [69]:
df.sample(n=2000)
Out[69]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | |
---|---|---|---|---|---|---|---|---|---|---|
546982 | 546983 | B000AYGXKC | A309XRV5MKT6ZS | Happy Customer | 0 | 0 | 5 | 1246233600 | Very Good Sugar Free Cookies | Murray's Sugar Free Peanut Butter cookies are ... |
125594 | 125595 | B0029NJ16K | A2GGBMCEGBMYQ5 | JOI "*PUMKIN*" | 0 | 0 | 5 | 1231027200 | THE BEST EVER | THIS IS THE BEST DOGIE FOOD EVER, MY POODLE OD... |
201808 | 201809 | B001EQ59AK | AEIALC4KRI314 | Dr. Cheryl Ann Dusty "Cherokee Angel" | 2 | 2 | 5 | 1314921600 | A must for every cook | This product is a must have amoung your kitche... |
5908 | 5909 | B001DIM8K8 | AV5PTOA4JL8TG | Sissy Jollie "Practical Gourmet" | 3 | 3 | 5 | 1237680000 | Best Oatmeal Ever! | This is the only oatmeal that my family will e... |
11947 | 11948 | B001CD1VI4 | A1HWCQGV41JRYV | M. Hudson | 0 | 1 | 5 | 1254096000 | Yummy! | I love these cookies! They are just perfect f... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
284003 | 284004 | B000PKMMQU | A1Y8PGTWFL5D0V | Customer | 3 | 3 | 1 | 1324684800 | No taste | Very unhappy with this purchase. There was es... |
450964 | 450965 | B001OCKIBY | A7T1NIWZHTC4P | Seven Kitties "7kitties" | 0 | 0 | 4 | 1245801600 | Yummy but.... | The packaging on the box compares this to a tr... |
175719 | 175720 | B003EJ9KLO | A3I0Z04OY8PFNF | Missi | 0 | 0 | 5 | 1285027200 | Love this mix! | This is an easy-to-use, healthy pancake mix. ... |
264918 | 264919 | B005O8BJU8 | ARA0UAUUO4B6X | Pauline N. Borderies "Pauline" | 0 | 0 | 5 | 1349481600 | We love Ella's Kitchen | The brekkie series are great because they are ... |
179439 | 179440 | B000CQC050 | AP1PTF85IH674 | Wendi A. Pilling | 3 | 3 | 5 | 1276646400 | Best Peppermint Tea We Have Found | My husband has IBS and peppermint tea is essen... |
2000 rows × 10 columns
In [70]:
final=df[0:2000]
In [71]:
final.head()
Out[71]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... |
1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... |
2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... |
3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... |
In [72]:
final.isnull().sum()
Out[72]:
Id 0 ProductId 0 UserId 0 ProfileName 0 HelpfulnessNumerator 0 HelpfulnessDenominator 0 Score 0 Time 0 Summary 0 Text 0 dtype: int64
In [73]:
final.duplicated().sum()
Out[73]:
np.int64(0)
In [74]:
final['Text'][0].split(' ')
Out[74]:
['I', 'have', 'bought', 'several', 'of', 'the', 'Vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality.', 'The', 'product', 'looks', 'more', 'like', 'a', 'stew', 'than', 'a', 'processed', 'meat', 'and', 'it', 'smells', 'better.', 'My', 'Labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'this', 'product', 'better', 'than', '', 'most.']
In [75]:
# =======================================================
# Berechnung der Textlänge in Wörtern
# =======================================================
In [76]:
def calc_len(text):
return len(text.split(' '))
In [77]:
final['Text_length']=final['Text'].apply(calc_len)
In [78]:
!pip install plotly
Requirement already satisfied: plotly in c:\users\miso\anaconda3\lib\site-packages (5.24.1) Requirement already satisfied: tenacity>=6.2.0 in c:\users\miso\anaconda3\lib\site-packages (from plotly) (9.0.0) Requirement already satisfied: packaging in c:\users\miso\anaconda3\lib\site-packages (from plotly) (24.2)
In [79]:
import plotly.express as px
In [80]:
# =======================================================
# Plotly Boxplot zur Verteilung der Textlängen
# =======================================================
In [81]:
px.box(final,y='Text_length')
In [82]:
sns.countplot(final['Score'])
Out[82]:
<Axes: ylabel='count'>
In [83]:
final.head()
Out[83]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | Text_length | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... | 49 |
1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... | 31 |
2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... | 99 |
3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... | 43 |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... | 30 |
In [84]:
# =======================================================
# Textvorverarbeitung – Kleinschreibung & Zeichen filtern
# =======================================================
In [85]:
final['Text']=final['Text'].str.lower()
In [86]:
final['Text'][164]
Out[86]:
'seriously this product was as tasteless as they come. there are much better tasting products out there but at 100 calories its better than a special k bar or cookie snack pack. you just have to season it or combine it with something else to share the flavor.'
In [87]:
import re
re.sub('[^a-zA-Z]',' ',final['Text'][164])
Out[87]:
'seriously this product was as tasteless as they come there are much better tasting products out there but at calories its better than a special k bar or cookie snack pack you just have to season it or combine it with something else to share the flavor '
In [88]:
# =======================================================
# Entfernen von Satzzeichen mit Python string.punctuation
# =======================================================
In [89]:
punctuations='''!()[]{}:;'"\<>.?/@#$%*_-'''
data=final['Text'][164]
no_punc=''
for char in data:
if char not in punctuations:
no_punc=no_punc + char
no_punc
Out[89]:
'seriously this product was as tasteless as they come there are much better tasting products out there but at 100 calories its better than a special k bar or cookie snack pack you just have to season it or combine it with something else to share the flavor'
In [90]:
import string
punctuations = string.punctuation
def remove_punc(review):
no_punc=''
for char in review:
if char not in punctuations:
no_punc=no_punc + char
return no_punc
In [91]:
final['Text']=final['Text'].apply(remove_punc)
In [92]:
final.head()
Out[92]:
Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | Text_length | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | i have bought several of the vitality canned d... | 49 |
1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | product arrived labeled as jumbo salted peanut... | 31 |
2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | this is a confection that has been around a fe... | 99 |
3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | if you are looking for the secret ingredient i... | 43 |
4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | great taffy at a great price there was a wide... | 30 |
In [93]:
data=final['Text'][164]
data
Out[93]:
'seriously this product was as tasteless as they come there are much better tasting products out there but at 100 calories its better than a special k bar or cookie snack pack you just have to season it or combine it with something else to share the flavor'
In [94]:
!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
Requirement already satisfied: nltk in c:\users\miso\anaconda3\lib\site-packages (3.9.1) Requirement already satisfied: click in c:\users\miso\anaconda3\lib\site-packages (from nltk) (8.1.8) Requirement already satisfied: joblib in c:\users\miso\anaconda3\lib\site-packages (from nltk) (1.4.2) Requirement already satisfied: regex>=2021.8.3 in c:\users\miso\anaconda3\lib\site-packages (from nltk) (2024.11.6) Requirement already satisfied: tqdm in c:\users\miso\anaconda3\lib\site-packages (from nltk) (4.67.1) Requirement already satisfied: colorama in c:\users\miso\anaconda3\lib\site-packages (from click->nltk) (0.4.6)
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Miso\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
Out[94]:
True
In [95]:
# =======================================================
# Entfernen von Stopwörtern mit nltk
# =======================================================
In [96]:
re=[word for word in data.split(' ') if word not in set(stopwords.words('english'))]
str=''
for wd in re:
str=str+wd
str=str+' '
str
Out[96]:
'seriously product tasteless come much better tasting products 100 calories better special k bar cookie snack pack season combine something else share flavor '
In [97]:
def remove_stopwords(review):
return ' '.join([word for word in review.split(' ') if word not in set(stopwords.words('english'))])
In [98]:
final['Text']=final['Text'].apply(remove_stopwords)
In [99]:
final['Text'][45]
Out[99]:
'seems little wholesome supermarket brands somewhat mushy doesnt quite much flavor either didnt pass muster kids probably wont buy'
In [100]:
final['Text'].str.contains('http?').sum()
Out[100]:
np.int64(35)
In [117]:
pd.set_option('display.max_rows',2000)
final['Text'].str.contains('http?');
In [102]:
review=final['Text'][21]
review
Out[102]:
'bought husband currently overseas loves apparently staff likes alsobr generous amounts twizzlers 16ounce bag well worth price hrefhttpwwwamazoncomgpproductb001gvisjmtwizzlers strawberry 16ounce bags pack 6a'
In [103]:
import re
In [104]:
url_pattern = re.compile(r'href\S*|http\S+')
url_pattern.sub(r'',review)
Out[104]:
'bought husband currently overseas loves apparently staff likes alsobr generous amounts twizzlers 16ounce bag well worth price strawberry 16ounce bags pack 6a'
In [105]:
# =======================================================
# Entfernen von URLs in den Texten
# =======================================================
In [106]:
def remove_urls(review):
url_pattern=re.compile(r'href|http.\w+')
return url_pattern.sub(r'',review)
In [107]:
final['Text']=final['Text'].apply(remove_urls)
In [108]:
final['Text'][21]
Out[108]:
'bought husband currently overseas loves apparently staff likes alsobr generous amounts twizzlers 16ounce bag well worth price strawberry 16ounce bags pack 6a'
In [109]:
final['Text'].str.contains('http?').sum()
Out[109]:
np.int64(0)
In [110]:
final['Text'][34].replace('br','')
Out[110]:
'instant oatmeal become soggy minute water hits bowl mccanns instant oatmeal holds texture excellent flavor good time mccanns regular oat meal excellent may take bit longer prepare time morning best instant and ive ever eaten close second noninstant variety mccanns instant irish oatmeal variety pack regular apples cinnamon maple own sugar 10count boxes pack 6'
In [111]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')
In [112]:
for i in range(len(final['Text'])):
final['Text'][i]=final['Text'][i].replace('br','')
In [113]:
comment_words=' '.join(final['Text'])
In [114]:
# =======================================================
# Wordcloud für bereinigten Text
# =======================================================
In [115]:
stopwords=set(STOPWORDS)
In [116]:
wordcloud=WordCloud(width=800,height=800,stopwords=stopwords).generate(comment_words)
plt.figure(figsize=(8,8))
plt.imshow(wordcloud)
plt.axis('off')
Out[116]:
(np.float64(-0.5), np.float64(799.5), np.float64(799.5), np.float64(-0.5))