import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer
import requests
# Gathering the data
#### Reading twitter archive enhanced file ####
# Load the WeRateDogs Twitter archive (tweet text, timestamps, ratings, dog-stage columns).
df_twitter_archive=pd.read_csv('twitter-archive-enhanced.csv')
df_twitter_archive
### Requesting image predictions from URL ###
# Download the image-predictions TSV programmatically.
# Fix: the original URL was wrapped in angle brackets ('<https://...>'), which
# makes requests.get fail and leaves a trailing '>' on the derived file name.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
req = requests.get(url, allow_redirects=True)
# Local file name = last path segment of the URL ('image-predictions.tsv').
file_name = url.rsplit('/', 1)[1]
with open(file_name, 'wb') as f:
    # Stream the response to disk in small chunks.
    for chunk in req.iter_content(chunk_size=800):
        if chunk:
            f.write(chunk)
# Read back the same file we just wrote (the original hardcoded the name,
# which diverged from file_name because of the stray '>').
df_images = pd.read_table(file_name)
df_images.head()
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'
# OAuth 1a authentication with the credentials above.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# wait_on_rate_limit=True makes tweepy sleep through rate-limit windows
# instead of raising, so the long download loop below can run unattended.
api = tweepy.API(auth, wait_on_rate_limit=True)
# Tweet IDs for which to gather additional data via Twitter's API.
# Fix: the instructor-supplied snippet referenced `df_1`, which is never
# defined in this notebook; the archive DataFrame here is `df_twitter_archive`.
tweet_ids = df_twitter_archive.tweet_id.values
len(tweet_ids)
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}  # tweet_id -> the exception raised for that ID
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            # Fix: write a real newline so the file is one JSON document per
            # line. The original wrote '\\n' (the two characters backslash+n),
            # which breaks the line-by-line json.loads parsing done later.
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
end = timer()
print(end - start)
print(fails_dict)
# Build a tweet-level metrics table (retweet and favourite counts) from the
# line-delimited JSON file, one dict per tweet.
# NOTE(review): this reads 'tweet-json.txt' while the API loop above writes
# 'tweet_json.txt' — presumably the hyphenated file is the Udacity-provided
# copy; confirm which file is intended.
api_records = []
with open('tweet-json.txt') as json_file:
    for raw_line in json_file:
        parsed = json.loads(raw_line)
        api_records.append({'tweet_id': parsed['id'],
                            'retweet_count': parsed['retweet_count'],
                            'favorite_count': parsed['favorite_count']})
df_api = pd.DataFrame(api_records)
df_api
# Visually assess a random sample of archive rows.
df_twitter_archive.sample(10)
- Missing values in the dog-stage classification columns (doggo, floofer, pupper, puppo).
- The missing values are written as the string 'None' rather than NaN.
- There are four separate stage columns where there should be a single classification column.
# Frequency of each extracted dog name; 'None' and lowercase words flag bad extractions.
df_twitter_archive.name.value_counts()
- Wrong names, such as the lowercase word 'a' extracted as a name.
- 'None' values used as names.
# Inspect the raw rating numerators.
df_twitter_archive.rating_numerator
- Some rating numerators are below 10
# Counts of the unusually large numerators (>20).
# Fix: the original referenced `.value_counts` without the call parentheses,
# which displays the bound method object instead of the counts.
df_twitter_archive['rating_numerator'][df_twitter_archive['rating_numerator']>20].value_counts()
- some ratings are very large
# An example row whose numerator is below 10.
df_twitter_archive[df_twitter_archive['rating_numerator']<10].sample()
# Read the full tweet text to check how this rating was parsed.
df_twitter_archive.text.iloc[2233]
# Rows whose denominator deviates from the standard 10.
df_twitter_archive[df_twitter_archive['rating_denominator']!=10]
- Some rating denominators do not equal 10.
# Column dtypes and non-null counts for the archive.
df_twitter_archive.info()
- The three ID columns (e.g. tweet_id) should be strings, not numbers.
- Dog-stage classifications such as doggo and floofer should be categories.
### Studying the second dataframe ###
# Column dtypes and non-null counts for the image predictions.
df_images.info()
# Assessing the first dataframe:
## Quality Issues:
- Missing values in the dog-stage classification columns (doggo, floofer, pupper, puppo).
- The missing values are written as the string 'None' rather than NaN.
- Wrong names, such as the lowercase word 'a' extracted as a name.
- 'None' values used as names.
- Some rating numerators are wrong: some are very large and some are below 10.
- The three ID columns (e.g. tweet_id) should be strings, not numbers.
- Dog-stage classifications such as doggo and floofer should be categories.
- Some rating denominators do not equal 10.
- Rows with no images (expanded_urls = NaN) should be deleted.
- Rows that are retweets or replies (non-null retweet/reply IDs) should be deleted.
- The retweet, reply, source, and expanded_urls columns should then be dropped.
## Tidiness Issues:
- There are four dog-stage columns where there should be a single classification column.
- The three dataframes should be merged into one.
# Cleaning
### Taking copies of the dataframes ###
# Clean the copies so the raw gathered DataFrames stay intact.
df_twitter_archive_clean=df_twitter_archive.copy()
df_images_clean= df_images.copy()
df_api_clean= df_api.copy()
##### Define
- Delete the tweet IDs that are in the first dataframe but not in the second (image) one.
##### Code
# Keep only archive tweets that also have an image prediction.
df_twitter_archive_clean= df_twitter_archive_clean[df_twitter_archive_clean.tweet_id.isin(df_images.tweet_id)]
df_twitter_archive_clean.shape[0]
##### Define
- Scrape missing names from the tweet text; if no name is found, set it to NaN.
###### Code
- using regular expressions.
# Match "name <Name>" / "named <Name>" / "name is <Name>" in the tweet text.
# Fix: the original compiled a RAW string containing doubled backslashes
# (r'...\\s...'), which matches a literal backslash followed by 's' and so
# never matches real tweets; `\s{1}` is also just `\s`.
pattern = re.compile(r'(?:name(?:d)?)\s(?:is\s)?([A-Za-z]+)')
for index, row in df_twitter_archive_clean.iterrows():
    # Only re-extract when the stored name looks wrong: a lowercase word
    # ('a', 'an', 'the', ...) or the 'None' placeholder.
    if row['name'][0].islower() or row['name'] == 'None':
        try:
            # Direct assignment of the first match; the original's
            # str.replace(row['name'], c_name) reduces to exactly this.
            df_twitter_archive_clean.loc[index, 'name'] = pattern.findall(row['text'])[0]
        except IndexError:
            # No name found in the text: mark as genuinely missing.
            df_twitter_archive_clean.loc[index, 'name'] = np.nan
##### Test
# Lowercase pseudo-names should now be gone from the counts.
df_twitter_archive_clean.name.value_counts()
##### Define
- Delete rows that have values in the retweeted_status_id column.
- Delete rows that have no value in expanded_urls.
##### Code
# Drop rows without an expanded URL (tweets with no image).
df_twitter_archive_clean = df_twitter_archive_clean[df_twitter_archive_clean['expanded_urls'].notnull()]
##### Test
# Should be 0 after the filter above.
df_twitter_archive_clean['expanded_urls'].isnull().sum()
##### Define
- Delete the rows that have values in the retweeted_status_id column.
- Delete the rows that have values in the in_reply_to_status_id column.
##### Code
# Keep only original tweets: drop retweets and replies.
df_twitter_archive_clean = df_twitter_archive_clean[df_twitter_archive_clean['retweeted_status_id'].isnull()]
df_twitter_archive_clean = df_twitter_archive_clean[df_twitter_archive_clean['in_reply_to_status_id'].isnull()]
##### Test
# Both counts should be 0 (count() ignores NaN).
df_twitter_archive_clean['retweeted_status_id'].count()
df_twitter_archive_clean['in_reply_to_status_id'].count()
df_twitter_archive_clean.shape[0]
##### Define
- Drop Columns that are no longer needed like retweets ids.
##### Code
# Drop the retweet/reply bookkeeping columns plus source and expanded_urls,
# which are no longer needed after the row filtering above.
df_twitter_archive_clean = df_twitter_archive_clean.drop(['in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id'
,'retweeted_status_user_id','retweeted_status_timestamp', 'source', 'expanded_urls'],axis=1)
##### Test
df_twitter_archive_clean.head()
##### Define
- Replace all 'None' values in the doggo, floofer, pupper, and puppo columns with an empty string.
- Combine the four columns into a single column called dog_class.
- Replace the remaining empty strings with NaN values.
# Collapse the four dog-stage indicator columns into a single 'dog_class'
# column, then drop the originals.
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
# Replace the 'None' placeholder with an empty string in each stage column.
for column in stage_columns:
    df_twitter_archive_clean[column] = df_twitter_archive_clean[column].replace('None', '')
# Concatenating the four columns yields the stage name(s) present in the row.
df_twitter_archive_clean['dog_class'] = (df_twitter_archive_clean['doggo']
                                         + df_twitter_archive_clean['floofer']
                                         + df_twitter_archive_clean['pupper']
                                         + df_twitter_archive_clean['puppo'])
# Rows with no stage at all become NaN rather than an empty string.
df_twitter_archive_clean['dog_class'] = df_twitter_archive_clean['dog_class'].replace('', np.nan)
# The individual indicator columns are now redundant.
df_twitter_archive_clean.drop(stage_columns, inplace=True, axis=1)
##### Test
df_twitter_archive_clean.sample(7)
#### Resetting indexes
df_twitter_archive_clean.reset_index(drop=True, inplace=True)
# Distribution of the consolidated dog_class values.
df_twitter_archive_clean['dog_class'].value_counts()
##### Define
- Separate the combined stage names with a space.
##### Code
# Put a space between the two stages for rows tagged with multiple dog stages.
# Fix: the original iterated a hard-coded range(1971), which silently skips
# rows if the frame size ever changes, and assigned through chained indexing
# (df[col][i] = ...), which pandas may apply to a copy. A vectorized replace
# does the same substitution safely.
df_twitter_archive_clean['dog_class'] = df_twitter_archive_clean['dog_class'].replace({
    'doggopupper': 'doggo pupper',
    'doggopuppo': 'doggo puppo',
    'doggofloofer': 'doggo floofer',
})
##### Test
# Combined stages should now read e.g. 'doggo pupper'.
df_twitter_archive_clean['dog_class'].value_counts()
##### Define
- Convert the ratings to float.
- Get the number of dogs per tweet.
- Calculate the per-dog average rating.
# Scrape the text for the correct numerator so decimal ratings (e.g. 13.5/10)
# are captured, stored as float. (The original first ran astype(float) on the
# old column, a dead statement that the extract below immediately overwrote;
# the pattern is rewritten as a raw string with the same runtime value.)
df_twitter_archive_clean['rating_numerator'] = df_twitter_archive_clean['text'].str.extract(
    r'(\d+\.?\d?\d?)\/\d{1,3}', expand=False).astype('float')
# For group photos the denominator is a multiple of 10: dogs_num is the pack
# size, used below to average the rating back to a per-dog /10 scale.
df_twitter_archive_clean['dogs_num'] = df_twitter_archive_clean['rating_denominator'][df_twitter_archive_clean['rating_denominator'] >= 20]/10
df_twitter_archive_clean.dogs_num.value_counts()
##### Calculating the average of the rating_numerator
# Rescale pack ratings (numerator >= 20) to a per-dog score by dividing by the
# pack size. Fix: use .loc instead of the original chained indexing
# (df[col][mask] = ...), which pandas may apply to a temporary copy.
pack_mask = df_twitter_archive_clean['rating_numerator'] >= 20
df_twitter_archive_clean.loc[pack_mask, 'rating_numerator'] = (
    df_twitter_archive_clean.loc[pack_mask, 'rating_numerator']
    / df_twitter_archive_clean['dogs_num'][df_twitter_archive_clean['dogs_num'] > 0]
)
##### Test
# Large pack ratings should now be scaled to per-dog values.
df_twitter_archive_clean['rating_numerator'].value_counts()
##### Define
- Make all denominators =10
##### Code
# Force every denominator to the standard 10 (numerators were rescaled above).
# Fix: use .loc instead of the original chained indexing assignment
# (df[col][mask] = 10), which triggers SettingWithCopy and may not write back.
df_twitter_archive_clean.loc[df_twitter_archive_clean['rating_denominator'] != 10, 'rating_denominator'] = 10
###### Test
# All denominators should now be 10.
df_twitter_archive_clean['rating_denominator'].value_counts()
df_twitter_archive_clean.sample(7)
##### Define
- Making all IDs alike in the three dataframes
##### code
# Restrict the API and image frames to the tweet IDs kept in the cleaned archive.
df_api_clean= df_api_clean[df_api_clean.tweet_id.isin(df_twitter_archive_clean.tweet_id)]
df_images_clean= df_images_clean[df_images_clean.tweet_id.isin(df_twitter_archive_clean.tweet_id)]
##### Test
# All three frames should now report matching row counts.
df_api_clean.shape[0]
df_images_clean.shape[0]
##### Define
- Resetting all indexes.
##### Code
# Re-index all three frames from 0 after the row filtering above.
df_twitter_archive_clean = df_twitter_archive_clean.reset_index(drop=True)
df_images_clean = df_images_clean.reset_index(drop=True)
df_api_clean = df_api_clean.reset_index(drop=True)
##### Test
df_twitter_archive_clean.tail()
df_images_clean.tail()
df_api_clean.tail()
##### Define
- Merge three DFs to one big DF
# Left-join the API metrics, then the image predictions, onto the cleaned
# archive, keyed on tweet_id.
all_df = df_twitter_archive_clean.merge(df_api_clean, on='tweet_id', how='left')
all_df = all_df.merge(df_images_clean, on='tweet_id', how='left')
all_df
##### Define
- Convert tweet_ID to string
##### Code
# IDs are identifiers, not quantities, so store them as strings.
all_df['tweet_id'] = all_df['tweet_id'].astype('str')
##### Test
all_df.info()
##### Define
- change time to datetime
##### Code
# Parse the timestamp strings into proper datetime values.
all_df['timestamp'] = pd.to_datetime(all_df['timestamp'])
##### Test
all_df.info()
##### Storing Data
##### Code
# Persist the merged, cleaned dataset for the analysis below.
all_df.to_csv('twitter_archive_master.csv',index=False)
### seeing different rows analysis
# Summary statistics of the numeric columns.
all_df.describe()
### Insights
- Average rating
- Maximum number of favourites
- Most common dog name
#### Calculate the average ####
# Mean of the per-dog rating numerators.
all_df.rating_numerator.mean()
#### Calculate the max num of favourites ####
all_df.favorite_count.max()
#### Calculate Most Common name ####
# mode() returns a Series; [0] takes the single most common name.
all_df.name.mode()[0]
### Plotting the relation between ratings and favourites
# Scatter: visualize whether higher-rated dogs collect more favourites.
rating_series = all_df['rating_numerator']
fav_series = all_df['favorite_count']
plt.scatter(rating_series, fav_series);
plt.xlabel('numerator ratings');
plt.ylabel('favorite counts');
plt.title('Relation between ratings numerator and favourites ')
# Add a calendar-year column derived from the tweet timestamp.
all_df['year'] = all_df['timestamp'].dt.year
# Bar chart of favourite counts by year.
plt.bar(all_df['year'], all_df['favorite_count']);
plt.xlabel('Year');
plt.ylabel('favorite counts');
plt.xticks(range(2015, 2018, 1));
plt.title('Relation between years and favourites')