2016-07-13 2 views
0

Mein Datensatz sieht wie folgt aus: https://www.dropbox.com/s/u4brzjnhac0pwnj/TEST.xlsx?dl=0 (Thema: Komplexe Datentransformation in Python)

Ich muss die Daten aus der ursprünglichen Tabelle in das Format der gewünschten Tabelle konvertieren (beide befinden sich in der angehängten Datei).

Ich habe eine Reihe von household_ids (HH) mit Daten über die Monate 1–7 für jeden Haushalt. Für jede Kombination aus HH und Monat möchte ich die Daten der vorangegangenen 3 Monate in derselben Zeile neben dem jeweiligen Datensatz haben. Dies soll für jeden Monat jedes Haushalts geschehen.

Das Problem lässt sich in Textform nur schwer erklären; ich denke, ein Blick auf die Daten macht es verständlicher.

Ich habe dafür Code geschrieben, der sehr ineffizient ist und über alle 5 Mio. Datensätze des Datensatzes iteriert; das dauert Tage. Kann dies effizienter durchgeführt werden?

import pandas as pd 
import os 

# Switch to the shared data directory so the input file can be opened by name.
os.chdir(r'H:\shared\tran')

c = pd.read_csv(r'0.csv')

# Add one empty-string placeholder column per (lag, metric) pair.  The
# resulting names are '_prev1_<metric>', '_prev2_<metric>' and
# '_prev3_<metric>', created lag by lag in the metric order listed here.
_PREV_METRICS = (
    'month_id',
    'tuned_duration',
    'weekend_tuned_duration',
    'channel_flips',
    'most_common_daypart',
    'programs_watched_per_hh',
    'midnight',
    'morning',
    'afternoon',
    'evening',
)
for _prefix in ('_prev1_', '_prev2_', '_prev3_'):
    for _metric in _PREV_METRICS:
        c[_prefix + _metric] = ''




def tran(v):
    """Fill the ``_prevN_*`` columns of ``c`` for one household's rows.

    For every row of ``v`` whose ``month_id`` is greater than 3, the rows of
    ``v`` with ``month_id`` equal to ``month_id - 1``, ``- 2`` and ``- 3`` are
    looked up and their metrics are written into the ``_prev1_*``,
    ``_prev2_*`` and ``_prev3_*`` columns of the module-level frame ``c`` at
    the current row's index label.  A lag whose month has no row in ``v`` is
    skipped and its columns keep their existing values.

    Parameters
    ----------
    v : pandas.DataFrame
        Rows to process; must expose ``month_id`` and the metric columns
        listed in ``fields`` below.  Intended to be one group of
        ``c.groupby('household_id')`` so that its index labels are valid
        labels of ``c``.

    Returns
    -------
    None
        The function works purely by mutating the global ``c``.
    """
    fields = (
        'month_id',
        'tuned_duration',
        'weekend_tuned_duration',
        'channel_flips',
        'most_common_daypart',
        'programs_watched_per_hh',
        'midnight',
        'morning',
        'afternoon',
        'evening',
    )

    for i in v.month_id:
        if i <= 3:
            # Only months with three potential predecessors are processed.
            continue

        ind = v[v.month_id == i].index[0]
        print ('index :', ind)

        for lag in (1, 2, 3):
            # Filter once per lag instead of once per copied field.
            prev = v[v.month_id == i - lag]
            if prev.empty:
                # No record for this household/month: leave the lag columns
                # untouched (previously signalled via a swallowed exception).
                continue
            for field in fields:
                # Each target column is fed from the identically named source
                # column (programs_watched_per_hh used to be filled from
                # most_common_daypart by mistake).
                c.loc[ind, '_prev%d_%s' % (lag, field)] = prev[field].iloc[0]


# Row-by-row variant of the same transformation, applied to the frame `z`.
# NOTE(review): `z` is not defined in the visible part of the script; it is
# presumably a frame with the same columns as `c` -- confirm before running.
z.head()  # result is discarded when run as a script; kept for interactive use
m=0       # not read anywhere in the visible code; kept in case later code uses it

# Metrics copied from each of the three preceding months into the matching
# '_prev<lag>_<metric>' column of the current row.
_z_fields = (
    'month_id',
    'tuned_duration',
    'weekend_tuned_duration',
    'channel_flips',
    'most_common_daypart',
    'programs_watched_per_hh',
    'midnight',
    'morning',
    'afternoon',
    'evening',
)

for k in z.household_id.unique():
    # Select this household's rows once instead of re-filtering the whole
    # frame for every copied value.  Only source columns are read from this
    # snapshot; all writes go to `z` itself.
    hh_rows = z[z['household_id']==k]
    for i in hh_rows.month_id:
        if i > 3:
            # index label of the row being filled
            ind = hh_rows[hh_rows.month_id==i].index[0]
            print ('Doing : hh:',k,' m:',i,' ind:',ind)
            for lag in (1, 2, 3):
                j = i - lag
                prev = hh_rows[hh_rows.month_id==j]
                if prev.empty:
                    print ('No record found for HH ',k,' and month ',j)
                    continue
                for field in _z_fields:
                    # Single .loc assignment writes into `z` directly; the
                    # lag-2 tuned_duration now lands in its own _prev2_ column
                    # instead of overwriting _prev3_tuned_duration.
                    z.loc[ind, '_prev%d_%s' % (lag, field)] = prev[field].iloc[0]
        else:
            # Report the month that was actually skipped (the lag variable j
            # is not set for this row).
            print ('Ignored for HH ',k,' and month ',i)

Aufruf der Funktion:

#c.groupby('household_id').apply(tran) 
+3

Bitte fügen Sie Beispieldaten und das gewünschte Ergebnis direkt in den Text Ihrer Frage ein. Denken Sie daran, dass zu Ihrer Zielgruppe auch zukünftige Leser gehören, für die der Link möglicherweise nicht mehr verfügbar ist. – Parfait

Antwort

0

Obwohl ich es in Python nicht schneller hinbekommen habe, konnte ich es in SQL mit Analysefunktionen (Fensterfunktionen) ziemlich schnell lösen.

-- For every row, return its own metrics plus the metrics of the 1st, 2nd and
-- 3rd following rows within the same household when that household's rows
-- are ordered by month_id descending, i.e. the three next-lower month_ids
-- that exist for the household.  Where no such row exists, lead() yields NULL.
-- NOTE(review): lead() steps over rows, not calendar months; if a household
-- has a gap in its months, the *_1/*_2/*_3 columns come from the next
-- available earlier row rather than strictly month_id-1/-2/-3 -- confirm
-- that this is the intended behaviour.
select month_id, household_id, tuned_duration,weekend_tuned_duration,channel_flips,most_common_daypart, 
programs_watched_per_hh,trend,midnight,morning,afternoon,evening, 
--rank() over (partition by month_id desc, household_id order by month_id desc desc 
-- Offset 1 (lead's default): values from the household's preceding row.
lead (month_id) over (partition by household_id order by month_id desc) as month_id_1, 
lead (household_id) over (partition by household_id order by month_id desc) as household_id_1, 
lead (tuned_duration) over (partition by household_id order by month_id desc) as tuned_duration_1, 
lead (weekend_tuned_duration) over (partition by household_id order by month_id desc) as weekend_tuned_duration_1, 
lead (channel_flips) over (partition by household_id order by month_id desc) as channel_flips_1, 
lead (most_common_daypart) over (partition by household_id order by month_id desc) as most_common_daypart_1, 
lead (programs_watched_per_hh) over (partition by household_id order by month_id desc) as program_watched_per_hh_1, 
lead (trend) over (partition by household_id order by month_id desc) as trend_1, 
lead (midnight) over (partition by household_id order by month_id desc) as midnight_1, 
lead (morning) over (partition by household_id order by month_id desc) as morning_1, 
lead (afternoon) over (partition by household_id order by month_id desc) as afternoon_1, 
lead (evening) over (partition by household_id order by month_id desc) as evening_1, 
-- Offset 2: values from two rows back in the household's history.
lead (month_id,2) over (partition by household_id order by month_id desc) as month_id_2, 
lead (household_id,2) over (partition by household_id order by month_id desc) as household_id_2, 
lead (tuned_duration,2) over (partition by household_id order by month_id desc) as tuned_duration_2, 
lead (weekend_tuned_duration,2) over (partition by household_id order by month_id desc) as weekend_tuned_duration_2, 
lead (channel_flips,2) over (partition by household_id order by month_id desc) as channel_flips_2, 
lead (most_common_daypart,2) over (partition by household_id order by month_id desc) as most_common_daypart_2, 
lead (programs_watched_per_hh,2) over (partition by household_id order by month_id desc) as program_watched_per_hh_2, 
lead (trend,2) over (partition by household_id order by month_id desc) as trend_2, 
lead (midnight,2) over (partition by household_id order by month_id desc) as midnight_2, 
lead (morning,2) over (partition by household_id order by month_id desc) as morning_2, 
lead (afternoon,2) over (partition by household_id order by month_id desc) as afternoon_2, 
lead (evening,2) over (partition by household_id order by month_id desc) as evening_2, 
-- Offset 3: values from three rows back in the household's history.
lead (month_id,3) over (partition by household_id order by month_id desc) as month_id_3, 
lead (household_id,3) over (partition by household_id order by month_id desc) as household_id_3, 
lead (tuned_duration,3) over (partition by household_id order by month_id desc) as tuned_duration_3, 
lead (weekend_tuned_duration,3) over (partition by household_id order by month_id desc) as weekend_tuned_duration_3, 
lead (channel_flips,3) over (partition by household_id order by month_id desc) as channel_flips_3, 
lead (most_common_daypart,3) over (partition by household_id order by month_id desc) as most_common_daypart_3, 
lead (programs_watched_per_hh,3) over (partition by household_id order by month_id desc) as program_watched_per_hh_3, 
lead (trend,3) over (partition by household_id order by month_id desc) as trend_3, 
lead (midnight,3) over (partition by household_id order by month_id desc) as midnight_3, 
lead (morning,3) over (partition by household_id order by month_id desc) as morning_3, 
lead (afternoon,3) over (partition by household_id order by month_id desc) as afternoon_3, 
lead (evening,3) over (partition by household_id order by month_id desc) as evening_3 
-- NOTE(review): "table" looks like a placeholder for the real source table
-- name -- replace before running.
from 
table