I found out this has something to do with the variation in the length of the input tokens from one inference to the next. The model doesn't seem to like receiving lengths that vary greatly; maybe this causes some sort of fragmentation in GPU memory? Here's code that only keeps the IMDB sentences with >= 512 tokens, so after truncation every input is exactly 512 tokens long. With that fixed length it is able to sustain GPU utilization at ~30 it/s.
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')

imdb = load_dataset('imdb')

# Keep only the reviews that tokenize to >= 512 tokens; with truncation=True they
# all get cut down to exactly 512, so every forward pass sees the same input shape.
print('starting collecting sentences with tokens >= 512')
sentences = [sentence for sentence in imdb['train']['text']
             if tokenizer(sentence, truncation=True, return_tensors='tf')['input_ids'].shape[-1] >= 512]
print('finished collecting sentences with tokens >= 512')

for k, sentence in tqdm(enumerate(sentences)):
    inputs = tokenizer(sentence, truncation=True, return_tensors='tf')
    output = model(inputs).logits
    pred = np.argmax(output.numpy(), axis=1)
    if k % 100 == 0:
        print(f"len(input_ids): {inputs['input_ids'].shape[-1]}")
Output:
7it [00:00, 31.12it/s]
len(input_ids): 512
107it [00:03, 32.38it/s]
len(input_ids): 512
...
...
3804it [02:00, 31.85it/s]
len(input_ids): 512
3904it [02:03, 32.50it/s]
len(input_ids): 512
3946it [02:04, 31.70it/s]
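If the varying input shapes really are the culprit, a quick way to test the idea on the whole dataset (not just the long reviews) is to pad every input to the same length, so the model always receives a (1, 512) tensor. This is just a sketch of that check, assuming the same distilbert-base-cased checkpoint as above; note that padding short reviews to 512 tokens wastes compute per example, so the point is only to see whether throughput stays steady once the shape is constant.

import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')

imdb = load_dataset('imdb')

for k, sentence in tqdm(enumerate(imdb['train']['text'])):
    # padding='max_length' + truncation=True forces every input to shape (1, 512),
    # so the GPU sees the same tensor shape on every call regardless of review length.
    inputs = tokenizer(sentence, truncation=True, padding='max_length',
                       max_length=512, return_tensors='tf')
    output = model(inputs).logits
    pred = np.argmax(output.numpy(), axis=1)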