Mein Datensatz sieht wie folgt aus: https://www.dropbox.com/s/u4brzjnhac0pwnj/TEST.xlsx?dl=0
Komplexe Datentransformation in Python
Ich muss die Daten aus der ursprünglichen Tabelle in die Form der gewünschten Tabelle konvertieren (beide befinden sich in der angehängten Datei).
Ich habe eine Reihe von household_ids (HH) mit Daten für die Monate 1–7 je Haushalt. Für jede Kombination aus HH und Monat möchte ich die Daten der letzten 3 Monate neben dem jeweiligen Datensatz in derselben Zeile haben. Dies soll für jeden Monat und jeden Haushalt geschehen.
Das Problem lässt sich nur schwer in Textform erklären; ich denke, ein Blick auf die Daten macht es verständlicher.
Ich habe dafür einen Code geschrieben, der sehr ineffizient ist: Die Iteration über alle 5 Millionen Datensätze dauert Tage. Kann dies effizienter durchgeführt werden?
import pandas as pd
import os
# Work from the shared folder that holds the input file, then load it.
os.chdir(r'H:\shared\tran')
c = pd.read_csv(r'0.csv')

# Pre-create one empty-string placeholder column per (lag, field) pair so that
# later code can write the lagged values cell by cell. The iteration order
# below reproduces the column order _prev1_* , _prev2_* , _prev3_*.
_lag_prefixes = ('_prev1_', '_prev2_', '_prev3_')
_lag_fields = (
    'month_id',
    'tuned_duration',
    'weekend_tuned_duration',
    'channel_flips',
    'most_common_daypart',
    'programs_watched_per_hh',
    'midnight',
    'morning',
    'afternoon',
    'evening',
)
for _prefix in _lag_prefixes:
    for _field in _lag_fields:
        c[_prefix + _field] = ''
def tran(v):
    """Copy the previous three months of one group's data into the global ``c``.

    For every ``month_id`` in ``v`` that is greater than 3, the first row of
    ``v`` for that month is taken as the target row. For each lag 1, 2 and 3,
    the first row of ``v`` whose ``month_id`` equals ``month - lag`` is looked
    up, and its fields are written to the ``_prev<lag>_<field>`` columns of
    the module-level frame ``c`` at the target row's index label.

    A lag month with no row in ``v`` is skipped, leaving the corresponding
    ``_prev<lag>_*`` cells of ``c`` untouched.

    Args:
        v: DataFrame holding the rows of a single group; presumably one
            household, as produced by ``c.groupby('household_id')`` (confirm
            against the caller). Its index labels must also be labels of ``c``.

    Returns:
        None. The result is the in-place modification of ``c``.
    """
    lag_fields = (
        'month_id',
        'tuned_duration',
        'weekend_tuned_duration',
        'channel_flips',
        'most_common_daypart',
        'programs_watched_per_hh',
        'midnight',
        'morning',
        'afternoon',
        'evening',
    )
    for month in v.month_id:
        # Only months after the third are processed (same guard as before;
        # written as "not >" so a NaN month is skipped as well).
        if not month > 3:
            continue
        ind = v[v.month_id == month].index[0]
        print('index :', ind)
        for lag in (1, 2, 3):
            # Filter once per lag instead of once per copied field.
            prior = v[v.month_id == month - lag]
            if prior.empty:
                # No record for that month in this group: leave placeholders.
                continue
            for field in lag_fields:
                # Read per column so each value keeps its own dtype, and write
                # with label-based .loc (.ix/.get_values no longer exist).
                c.loc[ind, '_prev%d_%s' % (lag, field)] = prior[field].iloc[0]
# Script variant of the lag fill: for every household in `z` and every month
# greater than 3, copy the fields of the household's rows for the previous
# one, two and three months into the `_prev<lag>_<field>` columns of the
# current row. `z` is expected to be defined before this point.
z.head()
m = 0  # not used in this section; kept in case later code reads it

lag_fields = (
    'month_id',
    'tuned_duration',
    'weekend_tuned_duration',
    'channel_flips',
    'most_common_daypart',
    'programs_watched_per_hh',
    'midnight',
    'morning',
    'afternoon',
    'evening',
)

for k in z.household_id.unique():
    # Select the household's rows once instead of re-filtering the whole
    # frame for every copied cell. Only the source columns are read from this
    # snapshot, and those are never modified below.
    household = z[z['household_id'] == k]
    for i in list(household.month_id):
        if not i > 3:
            # Report the skipped month itself; no lag month exists yet here.
            print ('Ignored for HH ', k, ' and month ', i)
            continue
        # Index label of the row being filled.
        ind = household[household.month_id == i].index[0]
        print ('Doing : hh:', k, ' m:', i, ' ind:', ind)
        for lag in (1, 2, 3):
            j = i - lag
            prior = household[household.month_id == j]
            if prior.empty:
                print ('No record found for HH ', k, ' and month ', j)
                continue
            for field in lag_fields:
                # Single .loc write on `z` itself: chained `z[col].ix[ind] =`
                # may assign to a temporary copy, and .ix/.get_values are gone.
                z.loc[ind, '_prev%d_%s' % (lag, field)] = prior[field].iloc[0]
Aufruf der Funktion:
#c.groupby('household_id').apply(tran)
Bitte fügen Sie die aktuellen Daten und das gewünschte Ergebnis direkt in den Text Ihrer Frage ein. Denken Sie daran, dass zu Ihrer Zielgruppe auch zukünftige Leser gehören, für die der Link möglicherweise nicht mehr verfügbar ist. – Parfait