diff --git a/src/main/java/teetime/stage/io/File2Lines.java b/src/main/java/teetime/stage/io/File2Lines.java
index e19f9c6dc6bdbee14a2308344f8d04dead46d2b6..e4118b4769a6c4f268db199e99fbd1ff1cf38b73 100644
--- a/src/main/java/teetime/stage/io/File2Lines.java
+++ b/src/main/java/teetime/stage/io/File2Lines.java
@@ -28,12 +28,33 @@ import teetime.framework.OutputPort;
 /**
  * @author Christian Wulf
  *
+ * @since 1.1
+ *
  */
 public final class File2Lines extends AbstractConsumerStage<File> {
 
 	private final OutputPort<String> outputPort = this.createOutputPort();
 
-	private String charset = "UTF-8";
+	private final String charset;
+
+	/**
+	 * <ol>
+	 * <li>charset = UTF-8
+	 * </ol>
+	 */
+	public File2Lines() {
+		this("UTF-8");
+	}
+
+	/**
+	 *
+	 * @param charset
+	 *            to be used when interpreting text files
+	 */
+	public File2Lines(final String charset) {
+		super();
+		this.charset = charset;
+	}
 
 	@Override
 	protected void execute(final File textFile) {
@@ -66,10 +87,6 @@ public final class File2Lines extends AbstractConsumerStage<File> {
 		return this.charset;
 	}
 
-	public void setCharset(final String charset) {
-		this.charset = charset;
-	}
-
 	public OutputPort<String> getOutputPort() {
 		return outputPort;
 	}
diff --git a/src/main/java/teetime/stage/io/File2SeqOfWords.java b/src/main/java/teetime/stage/io/File2SeqOfWords.java
new file mode 100644
index 0000000000000000000000000000000000000000..d09589a66c86e07544ff00ce41733ad8f8144529
--- /dev/null
+++ b/src/main/java/teetime/stage/io/File2SeqOfWords.java
@@ -0,0 +1,152 @@
+/**
+ * Copyright (C) 2015 TeeTime (http://teetime.sourceforge.net)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package teetime.stage.io;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.CharBuffer;
+
+import teetime.framework.AbstractConsumerStage;
+import teetime.framework.OutputPort;
+
+/**
+ * Reads each incoming text file in chunks of at most <code>bufferCapacity</code> characters and sends every chunk,
+ * cut at its last whitespace character so that no word is split, to the output port.
+ *
+ * @author Christian Wulf
+ *
+ */
+public final class File2SeqOfWords extends AbstractConsumerStage<File> {
+
+	private final OutputPort<String> outputPort = this.createOutputPort();
+
+	private final String charset;
+	private final int bufferCapacity;
+
+	/**
+	 * <ol>
+	 * <li>charset = UTF-8
+	 * <li>bufferCapacity = 1024
+	 * </ol>
+	 */
+	public File2SeqOfWords() {
+		this("UTF-8", 1024);
+	}
+
+	/**
+	 *
+	 * @param charset
+	 *            to be used when interpreting text files
+	 * @param bufferCapacity
+	 *            number of characters buffered at once; should be at least two characters larger than the longest word
+	 */
+	public File2SeqOfWords(final String charset, final int bufferCapacity) {
+		super();
+		this.charset = charset;
+		this.bufferCapacity = bufferCapacity;
+	}
+
+	/**
+	 * Sends the content of <code>textFile</code> as a sequence of strings, each cut in front of a whitespace character;
+	 * the rest behind the last cut is sent at the end of the file. Errors are logged and end the processing of the file.
+	 */
+	@Override
+	protected void execute(final File textFile) {
+		BufferedReader reader = null;
+		try {
+			reader = new BufferedReader(new InputStreamReader(new FileInputStream(textFile), this.charset));
+			final CharBuffer charBuffer = CharBuffer.allocate(bufferCapacity);
+			while (reader.read(charBuffer) != -1) {
+				final int position = getPreviousWhitespacePosition(charBuffer);
+				if (position <= 0) {
+					if (charBuffer.hasRemaining()) {
+						continue; // nothing to cut off yet, but there is still room to read more
+					}
+					// a full buffer without a usable cut can never make progress: read() would return 0 forever
+					if (logger.isErrorEnabled()) {
+						logger.error("A word in the following text file is bigger than the buffer's capacity: " + textFile.getAbsolutePath());
+					}
+					return; // must not depend on the log level
+				}
+				final int end = charBuffer.position(); // end of the characters read so far
+
+				charBuffer.limit(position);
+				charBuffer.rewind();
+				outputPort.send(charBuffer.toString()); // from 0 to position-1
+
+				charBuffer.limit(end); // carry over only characters actually read, not stale ones up to the capacity
+				charBuffer.position(position);
+				charBuffer.compact();
+			}
+			if (charBuffer.position() > 0) { // the text behind the last cut would be lost otherwise
+				charBuffer.flip();
+				outputPort.send(charBuffer.toString());
+			}
+		} catch (final FileNotFoundException e) {
+			this.logger.error("", e);
+		} catch (final IOException e) {
+			this.logger.error("", e);
+		} finally {
+			try {
+				if (reader != null) {
+					reader.close();
+				}
+			} catch (final IOException e) {
+				this.logger.warn("", e);
+			}
+		}
+	}
+
+	/**
+	 * @return the buffer index of the last whitespace character (space, LF, CR, or tab) in front of the buffer's
+	 *         current position, or <code>-1</code> if there is none
+	 */
+	private int getPreviousWhitespacePosition(final CharBuffer charBuffer) {
+		final char[] characters = charBuffer.array();
+		final int offset = charBuffer.arrayOffset();
+		int index = offset + charBuffer.position() - 1;
+
+		while (index >= offset) { // never look in front of the buffer's own slice of the backing array
+			switch (characters[index]) {
+			case ' ':
+			case '\n':
+			case '\r':
+			case '\t':
+				return index - offset;
+			default:
+				index--;
+			}
+		}
+		return -1;
+	}
+
+	public String getCharset() {
+		return this.charset;
+	}
+
+	public int getBufferCapacity() {
+		return bufferCapacity;
+	}
+
+	public OutputPort<String> getOutputPort() {
+		return outputPort;
+	}
+
+}
diff --git a/src/main/java/teetime/stage/io/File2TextLinesFilter.java b/src/main/java/teetime/stage/io/File2TextLinesFilter.java
index 977fef59d491ea0a2663171537a25d33209ce1cf..e4861a77a29d1bd2d481aaeab2b50574447926cd 100644
--- a/src/main/java/teetime/stage/io/File2TextLinesFilter.java
+++ b/src/main/java/teetime/stage/io/File2TextLinesFilter.java
@@ -29,12 +29,37 @@ import teetime.stage.util.TextLine;
 /**
  * @author Christian Wulf
  *
+ * @since 1.0
+ *
  */
 public final class File2TextLinesFilter extends AbstractConsumerStage<File> {
 
 	private final OutputPort<TextLine> outputPort = this.createOutputPort();
 
-	private String charset = "UTF-8";
+	private final String charset;
+
+	/**
+	 * <ol>
+	 * <li>charset = UTF-8
+	 * </ol>
+	 *
+	 * @since 1.1
+	 */
+	public File2TextLinesFilter() {
+		this("UTF-8");
+	}
+
+	/**
+	 *
+	 * @param charset
+	 *            to be used when interpreting text files
+	 *
+	 * @since 1.1
+	 */
+	public File2TextLinesFilter(final String charset) {
+		super();
+		this.charset = charset;
+	}
 
 	@Override
 	protected void execute(final File textFile) {
@@ -67,10 +92,6 @@ public final class File2TextLinesFilter extends AbstractConsumerStage<File> {
 		return this.charset;
 	}
 
-	public void setCharset(final String charset) {
-		this.charset = charset;
-	}
-
 	public OutputPort<TextLine> getOutputPort() {
 		return outputPort;
 	}