#!/usr/bin/perl
#
# Load GEO series "family" SOFT files into a MySQL database.
#
# The script (re)creates a set of gse_* / gsm_* tables, walks the directory
# tree under $data_files with File::Find, and for every file whose name
# matches GSE<number>_family.soft inserts the series title, description and
# PubMed id, plus the title, description, keyword, source, species and
# platform of each sample found in that file.

use strict;
use warnings;

use DBI;
use English;
use File::Find;

# States of the multi-line description collector used in wanted().
# The numeric values are the ones the script has always used.
use constant {
    DESC_NONE   => 0,    # not inside a description
    DESC_SERIES => 1,    # collecting a !Series_description
    DESC_SAMPLE => 3,    # collecting a !Sample_description
};

# Connect to the database.  RaiseError makes every failing DBI call die.
my $geo_db = DBI->connect( "DBI:mysql:database=geo;host=localhost",
    "xxxx", "xxxx", { 'RaiseError' => 1 } );

# GSE .soft files are in this directory
my $data_files = "/mnt/mount1/geo/by_series";

# Table name => column/index definition, in creation order.  Every table is
# dropped and recreated, so each run starts from empty tables.
my @tables = (
    [ gse_desc    => 'gse int not null primary key, description longtext' ],
    [ gse_title   => 'gse int not null primary key, title varchar(245)' ],
    [ gse_gsm     => 'gse int not null, index(gse), index(gsm), gsm int not null' ],
    [ gse_pubmed  => 'gse int not null, index(gse), pubmed bigint' ],
    [ gsm_title   => 'gsm int not null, index(gsm), title varchar(245)' ],
    [ gsm_species => 'gsm int not null, index(gsm), species varchar(32)' ],
    [ gsm_desc    => 'gsm int not null, index(gsm), description longtext' ],
    [ gsm_keyword => 'gsm int not null, index(gsm), keyword varchar(245)' ],
    [ gsm_source  => 'gsm int not null, index(gsm), source varchar(245)' ],
    [ gsm_gpl     => 'gsm int not null, index(gsm), gpl int not null, index(gpl)' ],
);

for my $table (@tables) {
    my ( $name, $columns ) = @{$table};
    $geo_db->do("drop table if exists $name");
    $geo_db->do("create table if not exists $name ($columns)");
}

# Prepared insert statements, one per table.
my $insert_gse_title   = $geo_db->prepare("insert into gse_title set gse = ?, title = ?");
my $insert_gse_desc    = $geo_db->prepare("insert into gse_desc set gse = ?, description = ?");
my $insert_gse_pubmed  = $geo_db->prepare("insert into gse_pubmed set gse = ?, pubmed = ?");
my $insert_gse_gsm     = $geo_db->prepare("insert into gse_gsm set gse = ?, gsm = ?");
my $insert_gsm_title   = $geo_db->prepare("insert into gsm_title set gsm = ?, title = ?");
my $insert_gsm_desc    = $geo_db->prepare("insert into gsm_desc set gsm = ?, description = ?");
my $insert_gsm_keyword = $geo_db->prepare("insert into gsm_keyword set gsm = ?, keyword = ?");
my $insert_gsm_source  = $geo_db->prepare("insert into gsm_source set gsm = ?, source = ?");
my $insert_gsm_species = $geo_db->prepare("insert into gsm_species set gsm = ?, species = ?");
my $insert_gsm_gpl     = $geo_db->prepare("insert into gsm_gpl set gsm = ?, gpl = ?");

# trim(LIST)
#
# Strip leading and trailing whitespace from copies of the arguments; the
# caller's values are not modified.  Returns all trimmed values in list
# context and only the first one in scalar context.
sub trim {
    my @out = @_;
    for (@out) {
        s/^\s+//;
        s/\s+$//;
    }
    return wantarray ? @out : $out[0];
}

find( \&wanted, $data_files );

# wanted()
#
# File::Find callback.  $_ holds the name of the current entry and
# $File::Find::name its full path.  Entries whose name does not match
# GSE<number>_family.soft are ignored; matching files are read line by line
# and their series / sample attributes inserted into the database.
#
# A file that cannot be opened is reported with warn and skipped.  Database
# errors die because the connection was opened with RaiseError.
sub wanted {
    return unless /GSE(.*)_family\.soft/;
    my $number = $1;

    print $_, "\n";

    my $path = $File::Find::name;
    my $geofile;
    if ( !open( $geofile, '<', $path ) ) {
        warn "Unable to open $path: $!\n";
        return;    # nothing to read; do not fall through into the loop
    }

    # Per-file parser state.  Kept lexical so that nothing leaks from one
    # file into the next.
    my $series_description = "";
    my $sample_description = "";
    my $sample;                      # GSM number of the sample being read
    my $desc_mode          = DESC_NONE;

    # NOTE(review): a description is only written when a later "!Series_" /
    # "!Sample_" line ends it.  A description still pending at end of file,
    # or when the next "^sample" line resets $sample_description, is
    # dropped.  Left unchanged because the pending text may by then also
    # contain unrelated lines - confirm against real SOFT files before
    # adding a flush.
    while ( my $line = <$geofile> ) {

        # --- series description (may continue over several lines) ---
        if ( $line =~ /^!Series_description = (.*)$/ ) {
            $series_description .= trim($1) . " ";
            $desc_mode = DESC_SERIES;
        }
        elsif ( $desc_mode == DESC_SERIES ) {
            if ( $line =~ /^!Series_/ ) {

                # Another series attribute ends the description.
                $desc_mode = DESC_NONE;
                if ( length($series_description) > 1 ) {
                    $insert_gse_desc->execute( $number, $series_description );
                    $series_description = "";
                }
            }
            else {
                # Any other line is taken as a continuation of the text.
                $series_description .= trim($line) . " ";
            }
        }

        # --- single-line series attributes ---
        if ( $line =~ /^!Series_title = (.*)$/ ) {
            my $title = $1;
            $insert_gse_title->execute( $number, $title )
                if length($title) > 1;
        }

        if ( $line =~ /^!Series_pubmed_id = (.*)$/ ) {
            my $pubmed = $1;
            $insert_gse_pubmed->execute( $number, $pubmed )
                if length($pubmed) > 1;
        }

        # --- start of a new sample ---
        if ( $line =~ /^\^sample = GSM(.*)/ ) {
            $sample = $1;
            $insert_gse_gsm->execute( $number, $sample );
            $sample_description = "";
        }

        # --- sample description (may continue over several lines) ---
        if ( $line =~ /^!Sample_description = (.*)$/ ) {
            $sample_description .= trim($1) . " ";
            $desc_mode = DESC_SAMPLE;
        }
        elsif ( $desc_mode == DESC_SAMPLE ) {
            if ( $line =~ /^!Sample_/ ) {

                # Another sample attribute ends the description.
                $desc_mode = DESC_NONE;
                if ( length($sample_description) > 1 ) {
                    $insert_gsm_desc->execute( $sample, $sample_description );
                    $sample_description = "";
                }
            }
            else {
                $sample_description .= trim($line) . " ";
            }
        }

        # --- single-line sample attributes ---
        if ( $line =~ /^!Sample_title = (.*)/ ) {
            $insert_gsm_title->execute( $sample, $1 );
        }

        if ( $line =~ /^!Sample_keyword = (.*)/ ) {
            $insert_gsm_keyword->execute( $sample, $1 );
        }

        if ( $line =~ /^!Sample_target_source = (.*)/ ) {
            $insert_gsm_source->execute( $sample, $1 );
        }

        if ( $line =~ /^!Sample_platform_id = GPL(.*)/ ) {
            $insert_gsm_gpl->execute( $sample, $1 );
        }

        if ( $line =~ /^!Sample_organism = (.*)$/ ) {
            my $species = $1;
            $insert_gsm_species->execute( $sample, $species )
                if length($species) > 1;
        }
    }

    close($geofile) or warn "Unable to close $path: $!\n";
    return;
}