# Notebook output options: show the result of every top-level expression in a
# cell (not only the last one) and stretch the notebook to the full page width.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
InteractiveShell.log_level = 'INFO'

# `display` and `HTML` are part of the public IPython.display API; importing
# them through IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


%matplotlib inline
import pandas as pd
import numpy as np
import logging
import sys

from collections import OrderedDict

# Send all log records through a single console handler on the root logger.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# The handler itself lets everything through; the logger level below decides
# what is actually emitted.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

log = logging.getLogger()
# Drop any handlers already attached to the root logger before adding ours,
# so each record is printed once.
log.handlers = []
log.setLevel(logging.INFO)  # switch to logging.DEBUG for more verbose output
log.addHandler(ch)

# Let DataFrames print up to 1000 characters wide before wrapping, and put the
# per-cell text width back to the pandas default (it can be raised with
# pd.set_option('display.max_colwidth', 200) if long values get truncated).
pd.options.display.width = 1000
pd.reset_option('display.max_colwidth')


# Load the first worksheet of every playlist workbook into its own DataFrame,
# dropping rows that repeat an (Artist, Name) pair within the same workbook.
all_files = ['data/Top80_sPop&Rock.xlsx','data/TopAlt&Electronic.xlsx']
list_of_playlists = []
for file_ in all_files:
    # Open the workbook in a context manager so its file handle is closed as
    # soon as the sheet has been parsed instead of staying open.
    with pd.ExcelFile(file_) as xls_file:
        df = xls_file.parse(0)  # load first sheet - there should only be one

    df = df.drop_duplicates(subset=['Artist', 'Name'])
    log.info("## %s: df.shape=%s", file_, df.shape)

    list_of_playlists.append(df)

# `reduce` is a builtin only on Python 2; functools provides it on both
# Python 2 and Python 3, so import it explicitly to avoid a NameError.
from functools import reduce

# Fold the playlists together pairwise on the (Artist, Name) key:
#   inner_df  - rows whose key appears in every playlist (intersection)
#   merged_df - rows whose key appears in any playlist (union)
inner_df = reduce(
    lambda left, right: pd.merge(left, right, on=['Artist', 'Name'], how='inner'),
    list_of_playlists,
)
merged_df = reduce(
    lambda left, right: pd.merge(left, right, on=['Artist', 'Name'], how='outer'),
    list_of_playlists,
)

log.info("inner_df.shape=%s", inner_df.shape)
log.info("merged_df.shape=%s", merged_df.shape)

# Sanity check on the merges for the first two playlists: the outer merge
# should contain exactly as many rows as inclusion-exclusion predicts,
# i.e. rows(A) + rows(B) - rows(A and B).
df1_cnt = len(list_of_playlists[0])
df2_cnt = len(list_of_playlists[1])
df_inner_cnt = len(inner_df)
df_merged_cnt = len(merged_df)
df_union_cnt = df1_cnt + df2_cnt - df_inner_cnt

log.info("Rows in dataset1=%s", df1_cnt)
log.info("Rows in dataset2=%s", df2_cnt)
log.info("Rows in intersection=%s", df_inner_cnt)
log.info("Rows in union=%s", df_union_cnt)
log.info("(Rows in union==Rows in merged) = %s", df_union_cnt == df_merged_cnt)

2018-04-24 15:35:13,785 - INFO - ## data/Top80_sPop&Rock.xlsx: df.shape=(634, 31)
2018-04-24 15:35:13,857 - INFO - ## data/TopAlt&Electronic.xlsx: df.shape=(185, 31)
2018-04-24 15:35:13,882 - INFO - inner_df.shape=(67, 60)
2018-04-24 15:35:13,885 - INFO - merged_df.shape=(752, 60)
2018-04-24 15:35:13,888 - INFO - Rows in dataset1=634
2018-04-24 15:35:13,891 - INFO - Rows in dataset2=185
2018-04-24 15:35:13,892 - INFO - Rows in intersection=67
2018-04-24 15:35:13,894 - INFO - Rows in union=752
2018-04-24 15:35:13,897 - INFO - (Rows in union==Rows in merged) = True

Merging Excel Worksheets using Pandas

Set up notebook output options

Import standard libraries

Import iTunes Playlist Data

Comments