# prom4.pl (GetPromoterInfo)
#
# Daniel Westreich, July 2002
# Mullins Lab, Microbiology Dept.
# University of Washington
#
# Usage:     prom4.pl filename
# Alt usage: prom4.pl filename > out.htm
#
# Reads records of the form "gene|gi|strand" (lines starting with '#' are
# skipped), fetches the GenBank view of each gi from the NCBI host over
# HTTP, extracts one mRNA coordinate for the gene and writes
# "gene|gi|strand|address" lines to promoterinfo.out.  Records that cannot
# be resolved are written as "#Error ..." lines instead.
#

use strict;
use warnings;
use IO::Socket;

my $host           = "www.ncbi.nlm.nih.gov";
my $frontcontigdoc = "/entrez/query.fcgi?cmd=Retrieve&db=nucleotide&list_uids=";
my $endcontigdoc   = "&dopt=GenBank";
my $outfile        = "promoterinfo.out";

# Run the driver only when executed as a script, so that the helper subs
# below can be loaded (require/do) without touching the network or files.
main() unless caller;

######################

# main()
#
# Processes every input record from the files named on the command line
# (or STDIN), logs the results to promoterinfo.out, then either reports
# that errors occurred or offers to launch the next step (prom5.pl).
sub main {
    open(my $out, '>', $outfile)
        or die "cannot open $outfile for writing: $!";

    my $error = 0;

    while (my $line = <>) {
        next if $line =~ /^\#/;    # comments
        chomp($line);

        my @words = split(/\|/, $line);
        next if @words == 0;

        # Pad with empty strings so a short record yields '' rather than
        # undef for the missing fields.
        my ($gene, $gi, $strand) = (@words, '', '', '')[0 .. 2];

        print "For gene ".$gene."\n";

        my $address = GetAddressFromContigURL($gene, $gi, $strand);
        $error = 1 if $address =~ /Error/;

        print {$out} $address."\n";
    }

    # Close (and thereby flush) the log before anything else reads it.
    close($out) or die "cannot close $outfile: $!";

    if ($error) {
        print "** ERROR. Check log file promoterinfo.out for details.\n\n";
        print "When errors are corrected, invoke next step with 'prom5.pl promoterinfo.out'";
        exit();
    }

    print "\n\nNo errors. Continue with step 5 (command line 'prom5.pl promoterinfo.out')? [Y/n]";

    # Anything other than an answer starting with n/N (including EOF on
    # STDIN) is treated as "yes".
    my $answer = <STDIN>;
    if (defined $answer && $answer =~ /^[nN]/) {
        print "You can invoke the next step of this process at any time with 'prom5.pl promoterinfo.out'";
    }
    else {
        system("prom5.pl promoterinfo.out");
    }
    return;
}

######################

# GetAddress($strand, $first_line, $last_line)
#
# Extracts a single coordinate from the text of an mRNA feature location.
#
#   $strand      "Crick" selects the Crick rule; any other value is
#                treated as "Watson".
#   $first_line  first line of the location text (used for Watson).
#   $last_line   last line of the location text (used for Crick).
#
# Watson: returns the first number of "join(N..M", or failing that the
#         first number of the first "N..M" on $first_line.
# Crick:  returns the second number of a ",N..M)" segment on $last_line,
#         or failing that the second number of the first "N..M".
#
# Returns the empty string when no coordinate is found.  The match result
# itself is tested rather than $1, because a failed match leaves $1 set
# to whatever an earlier successful match captured.
sub GetAddress {
    my ($strand, $first_line, $last_line) = @_;
    $strand     = '' unless defined $strand;
    $first_line = '' unless defined $first_line;
    $last_line  = '' unless defined $last_line;

    if ($strand eq "Crick") {
        if ($last_line =~ /\,[0-9]+\.\.([0-9]+)\)/) {
            return $1;
        }
        # then there is only one segment to the CDS
        if ($last_line =~ /[0-9]+\.\.([0-9]+)/) {
            return $1;
        }
        return '';
    }

    # ($strand eq "Watson")
    if ($first_line =~ /join\(([0-9]+)\.\.[0-9]+/) {
        return $1;
    }
    # then there's no "join" in the line
    if ($first_line =~ /([0-9]+)\.\.[0-9]+/) {
        return $1;
    }
    return '';
}

# GetAddressFromContigURL($gene, $gi, $strand)
#
# Requests the GenBank view of $gi from $host over plain HTTP and scans
# the response for the mRNA feature belonging to $gene.
#
# Scan procedure, as implemented below:
#   1. find a line containing "gene" whose following line contains $gene
#      (compared literally, not as a regular expression);
#   2. from there, find the first line matching "mRNA</a>" followed by
#      "complement" or "join";
#   3. collect that line and every following line up to (not including)
#      the next line containing "gene", and pass the first and last of
#      them to GetAddress().
#
# Returns "gene|gi|strand|address" on success, or a multi-line string
# beginning "#Error" when no matching mRNA feature is found.
# Dies if the TCP connection to $host cannot be opened.
#
# NOTE(review): step 2 only accepts locations starting with "complement"
# or "join"; a plain "N..M" mRNA location is never matched here even
# though GetAddress() can parse one -- confirm whether that is intended.
sub GetAddressFromContigURL {
    my ($gene, $gi, $strand) = @_;

    my $document = $frontcontigdoc.$gi.$endcontigdoc;
    print("**".$host.$document." \n**\n");

    my $remote = IO::Socket::INET->new(
        Proto    => "tcp",
        PeerAddr => $host,
        PeerPort => "http(80)",
    );
    unless ($remote) { die "cannot connect to http daemon on $host" }
    $remote->autoflush(1);
    print {$remote} "GET $document HTTP/1.0\n\n";

    # Pessimistic default, replaced only when the feature is located.
    my $address = "#Error - could not find address in contig for $gene\n#Check host http://".$host.$document."\n\n";

    LINE:
    while (my $line = <$remote>) {
        next LINE unless $line =~ /gene/;

        my $next = <$remote>;
        last LINE unless defined $next;

        # Literal substring test: gene names may contain characters that
        # are special in a regular expression.
        next LINE unless index($next, $gene) >= 0;

        while (my $feature = <$remote>) {
            next unless $feature =~ /mRNA<\/a>\s+complement/
                     || $feature =~ /mRNA<\/a>\s+join/;

            my $firstRNA = $feature;

            # Start from the first line so that a location occupying a
            # single line still gives GetAddress() something to parse
            # for the Crick strand.
            my $lastRNA = $firstRNA;
            while (my $continuation = <$remote>) {
                last if $continuation =~ /gene/;
                $lastRNA = $continuation;
            }

            my $addressRNA = GetAddress($strand, $firstRNA, $lastRNA);
            print "address of RNA is ".$addressRNA."\n";

            $address = $gene."|".$gi."|".$strand."|".$addressRNA;
            last LINE;
        }
    }

    close $remote;

    return $address;
}

1;