001 /*
002 * RandomFactorSequenceGenerator.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.textui;
033
034 import java.io.BufferedWriter;
035 import java.io.Writer;
036 import java.io.FileWriter;
037 import java.io.OutputStreamWriter;
038 import java.io.IOException;
039
040 /**
041 * This class is a simple command line based utility for generating random sequences with
042 * optimal LZ78 factorisation.
043 *
044 * <P>The main method takes three parameters from the command line to generate a
045 * sequence: <CODE>type</CODE>, <CODE>size</CODE> and <CODE>file</CODE>, where:
046 * <UL>
047 * <LI><B><CODE>type</CODE></B> is either <CODE>DNA</CODE> for DNA sequences or
048 * <CODE>PROT</CODE> for protein sequences.
049 * <LI><B><CODE>size</CODE></B> is the number of characters.
050 * <LI><B><CODE>file</CODE></B> (optional) is the name of a file (if omitted, the
051 * sequence is written to standard output).
052 * </UL>
053 * </P>
054 *
055 * @author Sergio A. de Carvalho Jr.
056 */
public class RandomFactorSequenceGenerator
{
    /**
     * Character set for DNA sequences.
     */
    private static final char[] DNA_CHARS = {'A', 'C', 'G', 'T'};

    /**
     * Character set for protein sequences.
     */
    private static final char[] PROT_CHARS = {'A','R','N','D','C','Q','E','G','H','I',
                        'L','K','M','F','P','S','T','W','Y','V','B','Z','X'};

    /**
     * The main method takes up to three parameters from the command line to generate
     * a sequence: the sequence type (<CODE>DNA</CODE> or <CODE>PROT</CODE>), the number
     * of characters and, optionally, the name of an output file.
     *
     * <P>The sequence itself is written to the file, or to standard output if no file
     * name is given. The character distribution report is written entirely to standard
     * error so that it never mixes with a sequence written to standard output.</P>
     *
     * <P>This method always terminates the JVM: exit status 0 on success, 1 on invalid
     * arguments and 2 on I/O failures.</P>
     *
     * @param args command line arguments
     */
    public static void main (String[] args)
    {
        Writer  output;
        String  seq_type, filename;
        int     size;
        char[]  charset;
        int[]   qty;

        // both <type> and <size> are required
        if (args.length < 2)
        {
            usage();
            System.exit(1);
            return;
        }

        // get 1st argument (required): sequence type
        seq_type = args[0];

        try
        {
            // get 2nd argument (required): number of characters
            size = Integer.parseInt(args[1]);
        }
        catch (NumberFormatException e)
        {
            usage();
            System.exit(1);
            return;
        }

        // validate character set
        if (seq_type.equalsIgnoreCase("DNA"))
            charset = DNA_CHARS;
        else if (seq_type.equalsIgnoreCase("PROT"))
            charset = PROT_CHARS;
        else
        {
            // no such option
            usage();
            System.exit(1);
            return;
        }

        // validate size
        if (size <= 3)
        {
            System.err.println ("Error: size must be greater than 3.");
            System.exit(1);
            return;
        }

        // get 3rd argument (optional): file name
        filename = (args.length > 2) ? args[2] : null;

        if (filename != null)
        {
            try
            {
                // open file for writing
                output = new BufferedWriter (new FileWriter (filename));
            }
            catch (IOException e)
            {
                System.err.println ("Error: couldn't open " + filename + " for writing.");
                e.printStackTrace();
                System.exit(2);
                return;
            }
        }
        else
        {
            // file name was omitted, use standard output
            output = new OutputStreamWriter (System.out);
        }

        try
        {
            try
            {
                qty = writeSequence (output, charset, size);
                output.flush();
            }
            finally
            {
                // only close writers we opened ourselves; standard output is
                // left open so the JVM can keep using it
                if (filename != null) output.close();
            }
        }
        catch (IOException e)
        {
            System.err.println ("Error: failed to write sequence.");
            e.printStackTrace();
            System.exit(2);
            return;
        }

        // print character distribution; header and counts both go to standard
        // error so that a sequence written to standard output stays clean
        System.err.println("\nCharacter distribution:");
        for (int i = 0; i < charset.length; i++)
            System.err.println(charset[i] + ": " + qty[i]);

        System.exit(0);
    }

    /**
     * Writes a random sequence of <CODE>size</CODE> characters in which every factor
     * is a copy of the previous factor extended by one randomly chosen character.
     *
     * @param output where the sequence is written to (neither flushed nor closed here)
     * @param charset the characters to choose from
     * @param size total number of characters to write
     * @return the number of times each character of <CODE>charset</CODE> was written,
     * indexed like <CODE>charset</CODE>
     * @throws IOException if writing to <CODE>output</CODE> fails
     */
    private static int[] writeSequence (Writer output, char[] charset, int size)
        throws IOException
    {
        // how many characters have been written of each type
        int[] qty = new int[charset.length];

        // The growing factor, stored as indexes into charset. The k-th factor has
        // k characters, so k factors take k(k+1)/2 characters in total; hence the
        // factor never grows beyond sqrt(2 * size) entries.
        int[] factor = new int [(int) Math.sqrt (2.0 * size) + 1];

        int written = 0, f_size = 0;

        while (written < size)
        {
            // copy previous factor (possibly truncated at the end of the sequence)
            for (int i = 0; i < f_size && written < size; i++)
            {
                output.write(charset[factor[i]]);
                qty[factor[i]]++;
                written++;
            }

            if (written < size)
            {
                // choose a character index randomly
                int random = (int) (Math.random() * charset.length);

                // extend factor with the random char index
                factor[f_size++] = random;

                output.write(charset[random]);
                qty[random]++;
                written++;
            }
        }

        return qty;
    }

    /**
     * Prints command line usage to standard error.
     */
    private static void usage ()
    {
        System.err.println(
        "\nUsage: RandomFactorSequenceGenerator <type> <size> [<file>]\n\n" +
        "where:\n\n" +
        " <type> = DNA for nucleotide sequences\n" +
        " or PROT for protein sequences\n\n" +
        " <size> = number os characters\n\n" +
        " <file> = name of a file to where the sequence is to be written\n" +
        " (if ommited, sequence is written to standard output)"
        );
    }
}