2016-04-20 9 views
0

Ich bekomme eine abgeschnittene Daten Warnung, wenn ich versuche, die zugewiesenen Werte in die angegebene MySQL-Tabelle zu importieren.Warnung beim Versuch, Scraped-Werte in MySQL-Tabelle zu importieren - mit Python und BS4

Die Werte werden in der SQL-Tabelle ohne "US" gespeichert. und eines der Daten, die eine Menge "" danach haben.

Was kann ich tun, um die Werte zu konvertieren, um die abgeschnittenen Werte angenehmer zu machen?

: Warning: Data truncated for column 'last_count' at row 1 
cur.execute('INSERT IGNORE INTO RIGCOUNT (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count) VALUES (\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\")',(area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count)) 
{'count': u'440', 'area': u'U.S.', 'change_from_prior_count': u'-3', 'last_count': u'15 April\r\n     2016', 'change_from_last_year': u'-514', 'date_of_last_year_count': u'17 April 2015', 'date_of_prior_count': u'8 April 2016'} 

import scraperwiki 
import requests 
from bs4 import BeautifulSoup 
import csv 
import MySQLdb 

#mysql portion 
mydb = MySQLdb.connect(host='localhost', 
     user= '********', 
     passwd='*******', 
     db='testdb') 
cur = mydb.cursor() 

def store (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count): 
    cur.execute('INSERT IGNORE INTO RIGCOUNT (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count) VALUES (\"%s\",$ 
    cur.connection.commit() 


base_url = 'http://phx.corporate-ir.net/phoenix.zhtml?c=79687&p=irol-rigcountsoverview' 
html = requests.get(base_url) 
soup = BeautifulSoup(html.content, "html.parser") 

table = soup.findAll('table') 
rows = table[1].findAll("tr") 
if len(soup.findAll('tr')) > 0: 
    rows = rows[1:] 
for row in rows: 
    cells = row.findAll('td') 
    area = cells[0].get_text() 
    last_count = cells[1].get_text() 
    count = cells[2].get_text() 
    change_from_prior_count = cells[3].get_text() 
    date_of_prior_count = cells[4].get_text() 
    change_from_last_year = cells[5].get_text() 
    date_of_last_year_count = cells[6].get_text() 
    store(area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count) 
    data = { 
     'area': cells[0].get_text(), 
     'last_count': cells[1].get_text(), 
     'count': cells[2].get_text(), 
     'change_from_prior_count': cells[3].get_text(), 
     'date_of_prior_count': cells[4].get_text(), 
     'change_from_last_year': cells[5].get_text(), 
     'date_of_last_year_count': cells[6].get_text(), 
    } 

    print data 
    print '\n' 
mydb.close() 

Antwort

0

... one of the dates that has a massive amount of " " after it.

Sie können mit nur einer mit einem regulären Ausdruck mehrere whispaces ersetzen.

import scraperwiki 
import requests 
from bs4 import BeautifulSoup 
import csv 
import MySQLdb 
import re 

#mysql portion 
mydb = MySQLdb.connect(host='localhost', 
     user= '******', 
     passwd='******', 
     db='testdb') 
cur = mydb.cursor() 


def store (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count): 
    cur.execute('INSERT IGNORE INTO RIGCOUNT (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count) VALUES (\"%s\",$ 
    cur.connection.commit() 


base_url = 'http://phx.corporate-ir.net/phoenix.zhtml?c=79687&p=irol-rigcountsoverview' 
html = requests.get(base_url) 
soup = BeautifulSoup(html.content, "html.parser") 

table = soup.findAll('table') 
rows = table[1].findAll("tr") 
if len(soup.findAll('tr')) > 0: 
    rows = rows[1:] 

white_space_pattern = re.compile(r"\s+") 

for row in rows: 
    cells = row.findAll('td') 
    cells = [white_space_pattern.sub(' ', cell.get_text()) for cell in cells] 

    area = cells[0] 
    last_count = cells[1] 
    count = cells[2] 
    change_from_prior_count = cells[3] 
    date_of_prior_count = cells[4] 
    change_from_last_year = cells[5] 
    date_of_last_year_count = cells[6] 

    store(area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count) 
    data = { 
     'area': area, 
     'last_count': last_count, 
     'count': count, 
     'change_from_prior_count': change_from_prior_count, 
     'date_of_prior_count': date_of_prior_count, 
     'change_from_last_year': change_from_last_year, 
     'date_of_last_year_count': date_of_last_year_count, 
    } 

    print data 
    print '\n' 
mydb.close() 
+0

Das ist also die neue Fehler Linie 33, in area = Zellen [0] .get_text() Indexerror: Listenindex –

+0

außerhalb des zulässigen Bereichs @citramaillo aktualisiert, vergessen jene 'get_text()' Anrufe zu entfernen – dm295

+0

noch einen Wurf bekommen - Traceback (jüngste Aufforderung zuletzt): File "rigcount2.py", Zeile 34, in area = Zellen [0] Indexerror: Listenindex jetzt außer Reichweite –