Skip to content
Snippets Groups Projects
Commit 821e1f36 authored by Christian Wulf's avatar Christian Wulf
Browse files

added File2SeqOfWords

parent 42443616
No related branches found
No related tags found
No related merge requests found
......@@ -28,12 +28,33 @@ import teetime.framework.OutputPort;
/**
* @author Christian Wulf
*
* @since 1.1
*
*/
public final class File2Lines extends AbstractConsumerStage<File> {
private final OutputPort<String> outputPort = this.createOutputPort();
private String charset = "UTF-8";
private final String charset;
/**
* <ol>
* <li>charset = UTF-8
* </ol>
*/
public File2Lines() {
this("UTF-8");
}
/**
*
* @param charset
* to be used when interpreting text files
*/
public File2Lines(final String charset) {
super();
this.charset = charset;
}
@Override
protected void execute(final File textFile) {
......@@ -66,10 +87,6 @@ public final class File2Lines extends AbstractConsumerStage<File> {
return this.charset;
}
public void setCharset(final String charset) {
this.charset = charset;
}
public OutputPort<String> getOutputPort() {
return outputPort;
}
......
/**
* Copyright (C) 2015 TeeTime (http://teetime.sourceforge.net)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package teetime.stage.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.CharBuffer;
import teetime.framework.AbstractConsumerStage;
import teetime.framework.OutputPort;
/**
 * Reads a text file and sends its content downstream as a sequence of text chunks that are
 * only ever split directly in front of a whitespace character (space, tab, CR, or LF), so
 * that no word is cut in half. Concatenating all chunks sent for one file yields the text
 * that was read from that file.
 *
 * @author Christian Wulf
 *
 */
public final class File2SeqOfWords extends AbstractConsumerStage<File> {

    private final OutputPort<String> outputPort = this.createOutputPort();

    private final String charset;
    private final int bufferCapacity;

    /**
     * <ol>
     * <li>charset = UTF-8
     * <li>bufferCapacity = 1024
     * </ol>
     */
    public File2SeqOfWords() {
        this("UTF-8", 1024);
    }

    /**
     *
     * @param charset
     *            to be used when interpreting text files
     * @param bufferCapacity
     *            number of characters that are buffered at once; a chunk is always shorter than
     *            this capacity, so every word (including the whitespace in front of it) must
     *            be shorter than this capacity, too
     */
    public File2SeqOfWords(final String charset, final int bufferCapacity) {
        super();
        this.charset = charset;
        this.bufferCapacity = bufferCapacity;
    }

    /**
     * Reads <code>textFile</code> chunk-wise and sends each chunk to the output port.
     * <p>
     * If the buffer is completely filled without containing a position at which it can be
     * split, an error is logged and the remainder of the file is skipped. I/O errors are
     * logged and end the processing of the file.
     *
     * @param textFile
     *            the text file to read
     */
    @Override
    protected void execute(final File textFile) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(textFile), this.charset));
            final CharBuffer charBuffer = CharBuffer.allocate(bufferCapacity);

            while (reader.read(charBuffer) != -1) {
                final int splitPosition = getPreviousWhitespacePosition(charBuffer);
                if (-1 == splitPosition) {
                    if (charBuffer.hasRemaining()) {
                        // the buffer is not full yet: the current word may still end within it
                        continue;
                    }
                    if (logger.isErrorEnabled()) {
                        logger.error("A word in the following text file is bigger than the buffer's capacity: " + textFile.getAbsolutePath());
                    }
                    // must not depend on the log level: going on would call limit(-1)
                    return;
                }

                // end of the characters read so far; everything behind it is stale
                final int end = charBuffer.position();

                charBuffer.limit(splitPosition);
                charBuffer.rewind();
                outputPort.send(charBuffer.toString()); // characters 0 to splitPosition-1

                // keep only the unsent, valid characters (starting with the whitespace)
                charBuffer.limit(end);
                charBuffer.position(splitPosition);
                charBuffer.compact();
            }

            // end of file: whatever follows the last split position has not been sent yet
            charBuffer.flip();
            if (charBuffer.hasRemaining()) {
                outputPort.send(charBuffer.toString());
            }
        } catch (final FileNotFoundException e) {
            this.logger.error("", e);
        } catch (final IOException e) {
            this.logger.error("", e);
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (final IOException e) {
                this.logger.warn("", e);
            }
        }
    }

    /**
     * Searches backwards, starting directly in front of the buffer's current position, for
     * the last whitespace character that is not the very first character of the buffer.
     * Index 0 is excluded because splitting there would produce an empty chunk and would
     * not consume any character.
     *
     * @param charBuffer
     *            the buffer whose characters from index 0 to position-1 are inspected
     * @return the buffer-relative index (&gt;= 1) of that whitespace character, or -1 if there is none
     */
    private int getPreviousWhitespacePosition(final CharBuffer charBuffer) {
        for (int index = charBuffer.position() - 1; index >= 1; index--) {
            // absolute get: does not move the position and honors the array offset
            switch (charBuffer.get(index)) {
            case ' ':
            case '\n':
            case '\r':
            case '\t':
                return index;
            default:
                break;
            }
        }
        return -1;
    }

    /**
     * @return the name of the charset used when interpreting text files
     */
    public String getCharset() {
        return this.charset;
    }

    /**
     * @return the capacity (in characters) of the buffer used for reading
     */
    public int getBufferCapacity() {
        return bufferCapacity;
    }

    /**
     * @return the output port to which the text chunks are sent
     */
    public OutputPort<String> getOutputPort() {
        return outputPort;
    }
}
......@@ -29,12 +29,37 @@ import teetime.stage.util.TextLine;
/**
* @author Christian Wulf
*
* @since 1.0
*
*/
public final class File2TextLinesFilter extends AbstractConsumerStage<File> {
private final OutputPort<TextLine> outputPort = this.createOutputPort();
private String charset = "UTF-8";
private final String charset;
/**
* <ol>
* <li>charset = UTF-8
* </ol>
*
* @since 1.1
*/
public File2TextLinesFilter() {
this("UTF-8");
}
/**
*
* @param charset
* to be used when interpreting text files
*
* @since 1.1
*/
public File2TextLinesFilter(final String charset) {
super();
this.charset = charset;
}
@Override
protected void execute(final File textFile) {
......@@ -67,10 +92,6 @@ public final class File2TextLinesFilter extends AbstractConsumerStage<File> {
return this.charset;
}
public void setCharset(final String charset) {
this.charset = charset;
}
public OutputPort<TextLine> getOutputPort() {
return outputPort;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment