Friday, November 09, 2007

Easy Import from CSV or PSV to ActiveRecord

More late night coding has produced...

A mixin to help me import data from a KnowWho data export of legislative data!

The KnowWho flat files are either CSV or PSV. I used rake tasks to do the import. Here's the rake task for the house:

  #The column order in the KnowWho CSV/PSV file
#"ZipS"|"ZipL"|"LowS"|"LowL"|"HighS"|"HighL"|"District"|"DistrictDesc"|"Member"|"Prefix"|"FirstName"|"MiddleName"|"LastName"|"Suffix"|"Tag"|"PartyDesc"|
# 0 1 2 3 4 5 6 7 8 9 10 12 13 14 16 17
#"Room"|"Building"|"Street"|"City"|"State"|"ZipCode"|"Telephone"|"MemberID"|"PersonID"
# 18 19 20 21 22 23 24 25 26
desc "import KnowWho House PSV"
task :house_psv => :environment do
day = "2007-10-08".to_date
yell_psv("rake import:house BEGINS at " + Time.now.strftime("%m/%d/%Y %I:%M %p"), day, "House")
puts "rake import:house BEGINS at " + Time.now.strftime("%m/%d/%Y %I:%M %p")
require 'fastercsv'
file = "#{RAILS_ROOT}/lib/psv/" + day.strftime("%Y_%m_%d") + "/Zip9Digit_USHouse_PSV_100807.txt"
yell_psv("Importing file: " + file, day, "House")
count = 1
arr = Array.new

#Optimize the party query
parts = ["Democratic Party","Republican Party"]
parties = PoliticalParty.find(:all, :conditions => ["name in (?)", parts])

FasterCSV.foreach(file, :col_sep => "|",
:headers => :first_row,:return_headers => true,
:skip_blanks => true ) do |row|
puts "CSV ROW: #{count.to_s}"
unless row.header_row?
if row['LastName'] == 'Vacant' && row['FirstName'].blank?
yell_psv("Vacant!", day, "House")
puts "Vacant!"
leader_id = nil
else
party = parties.select {|x| x.name == row['PartyDesc']}[0]
political_party = party.blank? ? PoliticalParty.find_or_create_by_row(row, day) : party
leader = Leader.find_or_create_by_row(row, day, political_party.id)
leader_id = leader.id
end
district = HouseDistrict.find_or_create_by_row(row, day)
zip_range = ZipRange.find_or_create_by_row(row, day, district.id)
position = Position.find_or_create_representative(row, day, district.id, leader_id, Department.find_by_name("United States House of Representatives").id)
addr = Address.new(:street1=>row['Building'],:street2=>['Room'],:city=>row['City'],:state=>row['State'],
:postal_code=>row['ZipCode'],:country=>'USA')
phone = row['Telephone'].blank? ? nil : PhoneNumber.new(:number_type=>"default",:number=>row['Telephone'])
position.addresses << addr unless addr.nil?
position.phone_numbers << phone unless phone.nil?
else
puts "This is the Header Row"
end
count+=1
end
yell_psv("rake import:house ENDS at " + Time.now.strftime("%m/%d/%Y %I:%M %p"), day, "House")
puts "rake import:house ENDS at " + Time.now.strftime("%m/%d/%Y %I:%M %p")
end



As you can see I am importing a single flat file of data into many different tables.
As an example here is my Position class:

class Position < ActiveRecord::Base
validates_uniqueness_of :name
validates_uniqueness_of :house_district_id, :allow_nil => true
validates_uniqueness_of :senate_district_id, :allow_nil => true
validates_presence_of :name, :department_id
acts_as_addressable
acts_as_phoneable

belongs_to :department
belongs_to :house_district
belongs_to :senate_district
belongs_to :leader

# Add methods from ImportObject as class level methods (for instance level use include)
extend ImportObject

#"District"|"DistrictDesc"|"Room"|"Building"|
# 4 5 14 15
def self.find_or_create_senator(row, day, district_id, leader_id, department_id)
i = row['District']
obj = self.find_by_name(i)
hash = { "room" => row['Room'], "building" => row['Building'].split('- ')[1], "building_cd" => row['Building'].split('- ')[0],
"name" => row['District'], "department_id" => department_id,
"office_name" => row['DistrictDesc'], "senate_district_id" => district_id, "leader_id" => leader_id, "enabled" => true,
"data_date" => day, "data_source" => "KnowWho"}
del_arr = ["name", "data_source"]
return self.import_object(row, day, obj, hash, i, del_arr)
end

#"District"|"DistrictDesc"|"Room"|"Building"|
# 4 5 14 15
def self.find_or_create_representative(row, day, district_id, leader_id, department_id)
i = row['District']
obj = self.find_by_name(i)
hash = { "room" => row['Room'], "building" => row['Building'].split('- ')[1], "building_cd" => row['Building'].split('- ')[0],
"name" => row['District'], "department_id" => department_id,
"office_name" => row['DistrictDesc'], "house_district_id" => district_id, "leader_id" => leader_id, "enabled" => true,
"data_date" => day, "data_source" => "KnowWho"}
del_arr = ["name", "data_source"]
return self.import_object(row, day, obj, hash, i, del_arr)
end
end



And here's the all important mixin which gives each of my classes the ability to seamlessly import the ActiveRecord objects. I've imported several hundred thousand rows this way and nary a problem! This goes in a file called ImportObject.rb in the lib/ directory of the project, and is extended/included on a per model basis.

module ImportObject
def self.included(base)
# Initialize module.
end

def import_object(row, day, obj, hash, lookup_key, del_arr, skip = false)
klasss = self.to_s
if obj.blank?
yell_psv(klasss + " #{lookup_key} not found, creating new:", day, klasss)
puts klasss + " #{lookup_key} not found, creating new:"
obj = self.new
obj.attributes = obj.attributes.merge(hash)
if obj.save
puts klasss + " #{lookup_key} created"
yell_psv(klasss + " #{lookup_key} created.", day, klasss)
else
puts klasss + " #{lookup_key} falied to create."
yell_psv(klasss + " #{lookup_key} failed to create.", day, klasss)
end
return obj
elsif obj.data_date.is_a?(Date) && obj.data_date < day && !skip
del_arr.each {|x| hash.delete(x)}
obj.attributes = obj.attributes.merge(hash)
if obj.save
yell_psv(klasss + " #{lookup_key} updated.", day, klasss)
puts klasss + " #{lookup_key} updated."
return obj
else
yell_psv(klasss + " #{lookup_key} failed to update.", day, klasss)
puts klasss + " #{lookup_key} failed to update."
return nil
end
else
yell_psv(klasss + " #{lookup_key} skipped update.", day, klasss)
puts klasss + " #{lookup_key} skipped update."
return obj
end
end
end



Oh and for some Stupid Simple Logging of the import I created a yell_psv method in environment.rb:

def yell_psv(msg, date = Date.today, filename = "") 
# stupid simple logging:
f = File.open(File.expand_path(File.dirname(__FILE__) + "/../log/" + filename + date.strftime("%Y_%m_%d") + ".log"),"a")
f.puts msg
f.close
end



How 'bout them apples!

0 hashings:

This blog began on Monday, September 27, 2004.
Hits since 2/20/2007:
free hit counters
free hit counters