Skip to content

Extract mentions and hashtags from tweets

Extract hashtags and mentions from your Twitter data with regular expressions. This is a helpful first step in topic identification.

# Load packages
import pandas as pd 
import nltk
from nltk.tokenize import regexp_tokenize

import pandas as pd

# Upload your data as a csv or json file and load it as a data frame 
import pandas as pd
# Load the tweets into `df`, preferring the CSV export and falling back to the
# JSON export when the CSV cannot be opened or parsed.
try:
    df = pd.read_csv('twitter.csv')
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Only file-access and CSV-parsing failures trigger the fallback; a bare
    # `except:` would also swallow KeyboardInterrupt/SystemExit and hide
    # genuine programming errors.
    # The JSON is read with orient='index' and then transposed
    # (presumably the top-level keys are column names; verify against
    # your export).
    df = pd.read_json('twitter.json', orient='index').T

# Notebook-style preview of the first rows (no visible effect in a script).
df.head()
# Regex for hashtags: a '#' followed by one or more word characters.
pattern1 = r"#\w+"

# Regex for mentions: an '@' followed by one or more word characters.
# The single capturing group spans the entire match, so the tokens returned
# are the full mentions (including the '@').
pattern2 = r"(@\w+)"

# Build one list of tokens per tweet. Cells that are not strings (e.g. NaN
# for a missing tweet text) cannot be tokenized, so they yield an empty list
# instead of raising a TypeError.
hashtags = [
    regexp_tokenize(tweet, pattern1) if isinstance(tweet, str) else []
    for tweet in df['text']
]
mentions = [
    regexp_tokenize(tweet, pattern2) if isinstance(tweet, str) else []
    for tweet in df['text']
]

# Attach the results as new columns; each cell holds a list of strings.
df['mentions'] = mentions
df['hashtags'] = hashtags

# Notebook-style preview of the first rows (no visible effect in a script).
df.head()